Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
3e55f99
[SYSTEMDS-3644] Compressed transform encode
Baunsgaard Dec 30, 2024
86069c6
Perf Transform Encode
Baunsgaard Jan 13, 2025
11472b3
cleanup
Baunsgaard Jan 13, 2025
0a70ce4
multi column
Baunsgaard Jan 13, 2025
33fbc94
nowWithCompressed Input
Baunsgaard Jan 13, 2025
793eca9
more
Baunsgaard Jan 13, 2025
9a88e66
better count ubyte
Baunsgaard Jan 13, 2025
1a5b6e9
count by 8 byte
Baunsgaard Jan 13, 2025
f9c14fb
by8 count
Baunsgaard Jan 13, 2025
b33dd7e
safety tk settings
Baunsgaard Jan 13, 2025
59ca242
fix hash nnz
Baunsgaard Jan 13, 2025
0458808
HashMapToInt
Baunsgaard Jan 13, 2025
f16abe9
fix merge Recode
Baunsgaard Jan 13, 2025
fb325a0
only compressed
Baunsgaard Jan 13, 2025
01a06a5
fix recodeMapTest
Baunsgaard Jan 13, 2025
48135da
another minor fix
Baunsgaard Jan 13, 2025
487585a
unsafe
Baunsgaard Jan 13, 2025
4f32ca9
unsafe
Baunsgaard Jan 13, 2025
b859d4b
unsafe and safe
Baunsgaard Jan 13, 2025
058187f
chars buff
Baunsgaard Jan 13, 2025
67ad0d8
map to char refine
Baunsgaard Jan 13, 2025
3683378
restore log output
Baunsgaard Jan 13, 2025
b6933d4
otherBranch Logging
Baunsgaard Jan 13, 2025
0f45fe0
not really a change
Baunsgaard Jan 13, 2025
6744bf8
Binary readers update
Baunsgaard Jan 13, 2025
4f75d4c
compressed writer
Baunsgaard Jan 13, 2025
696c9c4
we try this optimization
Baunsgaard Jan 13, 2025
0cd1d8d
gammaSquared
Baunsgaard Jan 13, 2025
1d93f2f
reduce precalculated
Baunsgaard Jan 13, 2025
ce98558
entry set for HasMapToInt
Baunsgaard Jan 13, 2025
987de09
compress without specifying unique
Baunsgaard Jan 13, 2025
9bf572f
sampled all
Baunsgaard Jan 13, 2025
d0d09dc
dict writable
Baunsgaard Jan 14, 2025
280bd1f
not placeholder
Baunsgaard Jan 14, 2025
b60b1ea
fix csv metadata parsing
Baunsgaard Jan 14, 2025
7b2b420
license
Baunsgaard Jan 14, 2025
2ad1bf7
fix contains key
Baunsgaard Jan 14, 2025
b22f744
writing to disk is painfull
Baunsgaard Jan 14, 2025
4513643
more tests for custom array
Baunsgaard Jan 14, 2025
3203299
sum remove on combine
Baunsgaard Jan 14, 2025
8f4c716
combine uncompressed
Baunsgaard Jan 14, 2025
661b186
combine uncompressed error
Baunsgaard Jan 14, 2025
7bedb88
more functions
Baunsgaard Jan 14, 2025
3e983e4
simple version
Baunsgaard Jan 14, 2025
b1f7e6e
more logging
Baunsgaard Jan 14, 2025
613f397
logging
Baunsgaard Jan 14, 2025
4172e64
safety fix
Baunsgaard Jan 14, 2025
4758bce
only add nnz if combined
Baunsgaard Jan 14, 2025
495be27
fixes
Baunsgaard Jan 14, 2025
115fb90
bad logging add
Baunsgaard Jan 14, 2025
30d2d18
debugging move
Baunsgaard Jan 14, 2025
17ad997
why?
Baunsgaard Jan 14, 2025
847c01b
no longer an extension of Uncompressed
Baunsgaard Jan 14, 2025
099779a
copy and set
Baunsgaard Jan 14, 2025
56c2bec
count nnz
Baunsgaard Jan 14, 2025
1a150d3
remove logging
Baunsgaard Jan 14, 2025
fbee615
parallel putinto
Baunsgaard Jan 14, 2025
d39895a
count nnz
Baunsgaard Jan 14, 2025
578308a
set nnz
Baunsgaard Jan 14, 2025
6cd34c2
better parallelization
Baunsgaard Jan 14, 2025
2e7b301
SINGLE COL TMP INDEX
Baunsgaard Jan 14, 2025
a449d50
try ?
Baunsgaard Jan 14, 2025
110dec8
timing of combining
Baunsgaard Jan 14, 2025
4c0f7eb
compressed size ... even if abort
Baunsgaard Jan 14, 2025
51dc823
parallel
Baunsgaard Jan 15, 2025
7e746bf
more JIT
Baunsgaard Jan 15, 2025
c52ad51
keynull
Baunsgaard Jan 15, 2025
782245c
again
Baunsgaard Jan 15, 2025
12c6d1a
resizing
Baunsgaard Jan 15, 2025
ebb3555
fix cast to Integer
Baunsgaard Jan 15, 2025
a2bc526
inverse
Baunsgaard Jan 15, 2025
d7ee209
remove Integer.valueOf
Baunsgaard Jan 15, 2025
b5fbce9
... that was stupid
Baunsgaard Jan 15, 2025
0b45f8d
fix resize to correctly reflect size
Baunsgaard Jan 15, 2025
ea6ad18
remove imports
Baunsgaard Jan 15, 2025
114c170
fix column offsets
Baunsgaard Jan 15, 2025
fed33ec
correct character array memory
Baunsgaard Jan 15, 2025
f2756c6
transform full perf
Baunsgaard Jan 15, 2025
0221af0
reduce
Baunsgaard Jan 15, 2025
3fe379a
repeat passthrough
Baunsgaard Jan 15, 2025
3487c88
transform revert to cover all
Baunsgaard Jan 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ private void classifyPhase() {
LOG.info("Threshold was set to : " + threshold + " but it was above original " + _stats.originalCost);
LOG.info("Original size : " + _stats.originalSize);
LOG.info("single col size : " + _stats.estimatedSizeCols);
LOG.debug(String.format("--compressed size: %16d", _stats.originalSize));
if(!(costEstimator instanceof MemoryCostEstimator)) {
LOG.info("original cost : " + _stats.originalCost);
LOG.info("single col cost : " + _stats.estimatedCostCols);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,8 @@ public Object call() throws Exception {
final int maxCombined = c1i.getNumVals() * c2i.getNumVals();

if(maxCombined < 0 // int overflow
|| maxCombined > c1i.getNumRows()) // higher combined than number of rows.
|| maxCombined > c1i.getNumRows() // higher than number of rows
|| maxCombined > 100000) // higher than 100k ... then lets not precalculate it.
return null;

final IColIndex c = _c1._indexes.combine(_c2._indexes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,9 @@ public static long getExactSizeOnDisk(List<AColGroup> colGroups) {
}
ret += grp.getExactSizeOnDisk();
}
LOG.error(" duplicate dicts on exact Size on Disk : " + (colGroups.size() - dicts.size()) );
if(LOG.isWarnEnabled())
LOG.warn(" duplicate dicts on exact Size on Disk : " + (colGroups.size() - dicts.size()) );

return ret;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.PlaceHolderDict;
import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
Expand Down Expand Up @@ -93,7 +94,7 @@ public static AColGroup create(IColIndex colIndices, int numRows, IDictionary di
int[] cachedCounts) {
if(dict == null)
return new ColGroupEmpty(colIndices);
else if(data.getUnique() == 1) {
else if(data.getUnique() == 1 && !(dict instanceof PlaceHolderDict)) {
MatrixBlock mb = dict.getMBDict(colIndices.size()).getMatrixBlock().slice(0, 0);
return ColGroupSDCSingleZeros.create(colIndices, numRows, MatrixBlockDictionary.create(mb), offsets, null);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,12 @@ public class ColGroupUncompressed extends AColGroup {
*/
private final MatrixBlock _data;

private ColGroupUncompressed(MatrixBlock mb, IColIndex colIndexes) {
/**
* Do not use this constructor of the uncompressed column group directly; instead use the create constructor.
*
* @param mb         The contained data.
* @param colIndexes Column indexes for this column group
*/
protected ColGroupUncompressed(MatrixBlock mb, IColIndex colIndexes) {
super(colIndexes);
// The matrix block reference is stored as given (no copy is made here).
_data = mb;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.sysds.runtime.compress.colgroup;

import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
import org.apache.sysds.runtime.data.DenseBlock;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.data.SparseBlockMCSR;
import org.apache.sysds.runtime.frame.data.columns.Array;
import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
import org.apache.sysds.runtime.matrix.operators.UnaryOperator;

/**
* Special sideways compressed column group not supposed to be used outside of the compressed transform encode.
*/
public class ColGroupUncompressedArray extends AColGroup {
private static final long serialVersionUID = -825423333043292199L;

public final Array<?> array;
public final int id; // columnID

public ColGroupUncompressedArray(Array<?> data, int id, IColIndex colIndexes) {
super(colIndexes);
this.array = data;
this.id = id;
}

@Override
public int getNumValues() {
return array.size();
}

@Override
public long estimateInMemorySize() {
// not accurate estimate, but guaranteed larger.
return MatrixBlock.estimateSizeInMemory(array.size(), 1, array.size()) + 80;
}

@Override
public String toString() {
return "UncompressedArrayGroup: " + id + " " + _colIndexes;
}

@Override
public AColGroup copyAndSet(IColIndex colIndexes) {
return new ColGroupUncompressedArray(array, id, colIndexes);
}

@Override
public void decompressToDenseBlockTransposed(DenseBlock db, int rl, int ru) {
throw new UnsupportedOperationException("Unimplemented method 'decompressToDenseBlockTransposed'");
}

@Override
public void decompressToSparseBlockTransposed(SparseBlockMCSR sb, int nColOut) {
throw new UnsupportedOperationException("Unimplemented method 'decompressToSparseBlockTransposed'");
}

@Override
public double getIdx(int r, int colIdx) {
throw new UnsupportedOperationException("Unimplemented method 'getIdx'");
}

@Override
public CompressionType getCompType() {
throw new UnsupportedOperationException("Unimplemented method 'getCompType'");
}

@Override
protected ColGroupType getColGroupType() {
throw new UnsupportedOperationException("Unimplemented method 'getColGroupType'");
}

@Override
public void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) {
throw new UnsupportedOperationException("Unimplemented method 'decompressToDenseBlock'");
}

@Override
public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) {
throw new UnsupportedOperationException("Unimplemented method 'decompressToSparseBlock'");
}

@Override
public AColGroup rightMultByMatrix(MatrixBlock right, IColIndex allCols, int k) {
throw new UnsupportedOperationException("Unimplemented method 'rightMultByMatrix'");
}

@Override
public void tsmm(MatrixBlock ret, int nRows) {
throw new UnsupportedOperationException("Unimplemented method 'tsmm'");
}

@Override
public void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl, int cu) {
throw new UnsupportedOperationException("Unimplemented method 'leftMultByMatrixNoPreAgg'");
}

@Override
public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result, int nRows) {
throw new UnsupportedOperationException("Unimplemented method 'leftMultByAColGroup'");
}

@Override
public void tsmmAColGroup(AColGroup other, MatrixBlock result) {
throw new UnsupportedOperationException("Unimplemented method 'tsmmAColGroup'");
}

@Override
public AColGroup scalarOperation(ScalarOperator op) {
throw new UnsupportedOperationException("Unimplemented method 'scalarOperation'");
}

@Override
public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) {
throw new UnsupportedOperationException("Unimplemented method 'binaryRowOpLeft'");
}

@Override
public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) {
throw new UnsupportedOperationException("Unimplemented method 'binaryRowOpRight'");
}

@Override
public void unaryAggregateOperations(AggregateUnaryOperator op, double[] c, int nRows, int rl, int ru) {
throw new UnsupportedOperationException("Unimplemented method 'unaryAggregateOperations'");
}

@Override
protected AColGroup sliceSingleColumn(int idx) {
throw new UnsupportedOperationException("Unimplemented method 'sliceSingleColumn'");
}

@Override
protected AColGroup sliceMultiColumns(int idStart, int idEnd, IColIndex outputCols) {
throw new UnsupportedOperationException("Unimplemented method 'sliceMultiColumns'");
}

@Override
public AColGroup sliceRows(int rl, int ru) {
throw new UnsupportedOperationException("Unimplemented method 'sliceRows'");
}

@Override
public double getMin() {
throw new UnsupportedOperationException("Unimplemented method 'getMin'");
}

@Override
public double getMax() {
throw new UnsupportedOperationException("Unimplemented method 'getMax'");
}

@Override
public double getSum(int nRows) {
throw new UnsupportedOperationException("Unimplemented method 'getSum'");
}

@Override
public boolean containsValue(double pattern) {
throw new UnsupportedOperationException("Unimplemented method 'containsValue'");
}

@Override
public long getNumberNonZeros(int nRows) {
throw new UnsupportedOperationException("Unimplemented method 'getNumberNonZeros'");
}

@Override
public AColGroup replace(double pattern, double replace) {
throw new UnsupportedOperationException("Unimplemented method 'replace'");
}

@Override
public void computeColSums(double[] c, int nRows) {
throw new UnsupportedOperationException("Unimplemented method 'computeColSums'");
}

@Override
public CM_COV_Object centralMoment(CMOperator op, int nRows) {
throw new UnsupportedOperationException("Unimplemented method 'centralMoment'");
}

@Override
public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) {
throw new UnsupportedOperationException("Unimplemented method 'rexpandCols'");
}

@Override
public double getCost(ComputationCostEstimator e, int nRows) {
throw new UnsupportedOperationException("Unimplemented method 'getCost'");
}

@Override
public AColGroup unaryOperation(UnaryOperator op) {
throw new UnsupportedOperationException("Unimplemented method 'unaryOperation'");
}

@Override
public boolean isEmpty() {
throw new UnsupportedOperationException("Unimplemented method 'isEmpty'");
}

@Override
public AColGroup append(AColGroup g) {
throw new UnsupportedOperationException("Unimplemented method 'append'");
}

@Override
protected AColGroup appendNInternal(AColGroup[] groups, int blen, int rlen) {
throw new UnsupportedOperationException("Unimplemented method 'appendNInternal'");
}

@Override
public ICLAScheme getCompressionScheme() {
throw new UnsupportedOperationException("Unimplemented method 'getCompressionScheme'");
}

@Override
public AColGroup recompress() {
throw new UnsupportedOperationException("Unimplemented method 'recompress'");
}

@Override
public CompressedSizeInfoColGroup getCompressionInfo(int nRow) {
throw new UnsupportedOperationException("Unimplemented method 'getCompressionInfo'");
}

@Override
protected AColGroup fixColIndexes(IColIndex newColIndex, int[] reordering) {
throw new UnsupportedOperationException("Unimplemented method 'fixColIndexes'");
}

@Override
public AColGroup reduceCols() {
throw new UnsupportedOperationException("Unimplemented method 'reduceCols'");
}

@Override
public double getSparsity() {
throw new UnsupportedOperationException("Unimplemented method 'getSparsity'");
}

@Override
protected void sparseSelection(MatrixBlock selection, P[] points, MatrixBlock ret, int rl, int ru) {
throw new UnsupportedOperationException("Unimplemented method 'sparseSelection'");
}

@Override
protected void denseSelection(MatrixBlock selection, P[] points, MatrixBlock ret, int rl, int ru) {
throw new UnsupportedOperationException("Unimplemented method 'denseSelection'");
}

@Override
public AColGroup[] splitReshape(int multiplier, int nRow, int nColOrg) {
throw new UnsupportedOperationException("Unimplemented method 'splitReshape'");
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ else if(contiguous)
return ArrayIndex.estimateInMemorySizeStatic(nCol);
}

public static IColIndex combine(List<AColGroup> gs) {
public static IColIndex combine(List<? extends AColGroup> gs) {
int numCols = 0;
for(AColGroup g : gs)
numCols += g.getNumCols();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,26 @@ public void copyBit(MapToBit d) {

@Override
public int[] getCounts(int[] ret) {
for(int i = 0; i < _data.length; i++)
final int h = (_data.length) % 8;
for(int i = 0; i < h; i++)
ret[_data[i] & 0xFF]++;
getCountsBy8P(ret, h, _data.length);
return ret;
}

/**
 * Count the occurrences of each value of {@code _data} in the range [s, e), handling eight entries per iteration.
 *
 * Each entry is masked with {@code 0xFF} and used as the index of the counter to increment. The range length
 * {@code e - s} has to be a multiple of 8, since every iteration reads the entries at {@code i} to {@code i + 7}
 * without any further bounds check.
 *
 * @param ret The counts array to increment into
 * @param s   The start index (inclusive)
 * @param e   The end index (exclusive)
 */
private void getCountsBy8P(int[] ret, int s, int e) {
	int i = s;
	while(i < e) {
		// manually unrolled: one block of eight consecutive entries
		ret[_data[i] & 0xFF]++;
		ret[_data[i + 1] & 0xFF]++;
		ret[_data[i + 2] & 0xFF]++;
		ret[_data[i + 3] & 0xFF]++;
		ret[_data[i + 4] & 0xFF]++;
		ret[_data[i + 5] & 0xFF]++;
		ret[_data[i + 6] & 0xFF]++;
		ret[_data[i + 7] & 0xFF]++;
		i += 8;
	}
}

@Override
protected void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, int cu, int off) {
final int h = (cu - cl) % 8;
Expand Down
Loading
Loading