libfsst.cpp source code [Velox/build/_deps/duckdb-src/third_party/fsst/libfsst.cpp]

1	// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
2	//
3	// Copyright 2018-2020, CWI, TU Munich, FSU Jena
4	//
5	// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
6	// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
7	// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
8	// furnished to do so, subject to the following conditions:
9	//
10	// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
11	//
12	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
13	// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
14	// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
15	// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
16	//
17	// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
18	#include "libfsst.hpp"
19
20	Symbol concat(Symbol a, Symbol b) {
21	Symbol s;
22	u32 length = a.length()+b.length();
23	if (length > Symbol::maxLength) length = Symbol::maxLength;
24	s.set_code_len(FSST_CODE_MASK, len: length);
25	s.val.num = (b.val.num << (`8`*a.length())) \| a.val.num;
26	return s;
27	}
28
29	namespace std {
30	template <>
31	class hash<QSymbol> {
32	public:
33	size_t operator()(const QSymbol& q) const {
34	uint64_t k = q.symbol.val.num;
35	const uint64_t m = `0xc6a4a7935bd1e995`;
36	const int r = `47`;
37	uint64_t h = `0x8445d61a4e774912` ^ (`8`*m);
38	k *= m;
39	k ^= k >> r;
40	k *= m;
41	h ^= k;
42	h *= m;
43	h ^= h >> r;
44	h *= m;
45	h ^= h >> r;
46	return h;
47	}
48	};
49	}
50
51	bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; }
52
53	std::ostream& operator<<(std::ostream& out, const Symbol& s) {
54	for (u32 i=`0`; i<s.length(); i++)
55	out << s.val.str[i];
56	return out;
57	}
58
59	SymbolTable buildSymbolTable(Counters& counters, vector<u8> line, size_t len[], bool zeroTerminated=false) {
60	SymbolTable st = new* SymbolTable (), bestTable = new* SymbolTable ();
61	int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception)
62	size_t sampleFrac = `128`;
63
64	// start by determining the terminator. We use the (lowest) most infrequent byte as terminator
65	st->zeroTerminated = zeroTerminated;
66	if (zeroTerminated) {
67	st->terminator = `0`; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency
68	} else {
69	u16 byteHisto[`256`];
70	memset(s: byteHisto, c: `0`, n: sizeof(byteHisto));
71	for(size_t i=`0`; i<line.size(); i++) {
72	u8* cur = line [i];
73	u8* end = cur + len[i];
74	while(cur < end) byteHisto[*cur++]++;
75	}
76	u32 minSize = FSST_SAMPLEMAXSZ, i = st->terminator = `256`;
77	while(i-- > `0`) {
78	if (byteHisto[i] > minSize) continue;
79	st->terminator = i;
80	minSize = byteHisto[i];
81	}
82	}
83	assert(st->terminator != `256`);
84
85	// a random number between 0 and 128
86	auto rnd128 = [&](size_t i) { return `1` + (FSST_HASH((i+`1UL`)*sampleFrac)&`127`); };
87
88	// compress sample, and compute (pair-)frequencies
89	auto compressCount = [&](SymbolTable st, Counters &counters) { // returns gain*
90	int gain = `0`;
91
92	for(size_t i=`0`; i<line.size(); i++) {
93	u8* cur = line [i];
94	u8* end = cur + len[i];
95
96	if (sampleFrac < `128`) {
97	// in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x)
98	if (rnd128 (i) > sampleFrac) continue;
99	}
100	if (cur < end) {
101	u8* start = cur;
102	u16 code2 = `255`, code1 = st->findLongestSymbol(cur, end);
103	cur += st->symbols[code1].length();
104	gain += (int) (st->symbols[code1].length()-(`1`+isEscapeCode(pos: code1)));
105	while (true) {
106	// count single symbol (i.e. an option is not extending it)
107	counters.count1Inc(pos1: code1);
108
109	// as an alternative, consider just using the next byte..
110	if (st->symbols[code1].length() != `1`) // .. but do not count single byte symbols doubly
111	counters.count1Inc(pos1: *start);
112
113	if (cur==end) {
114	break;
115	}
116
117	// now match a new symbol
118	start = cur;
119	if (cur<end-`7`) {
120	u64 word = fsst_unaligned_load(V: cur);
121	size_t code = word & `0xFFFFFF`;
122	size_t idx = FSST_HASH(code)&(st->hashTabSize-`1`);
123	Symbol s = st->hashTab[idx];
124	code2 = st->shortCodes[word & `0xFFFF`] & FSST_CODE_MASK;
125	word &= (`0xFFFFFFFFFFFFFFFF` >> (u8) s.icl);
126	if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) {
127	code2 = s.code();
128	cur += s.length();
129	} else if (code2 >= FSST_CODE_BASE) {
130	cur += `2`;
131	} else {
132	code2 = st->byteCodes[word & `0xFF`] & FSST_CODE_MASK;
133	cur += `1`;
134	}
135	} else {
136	code2 = st->findLongestSymbol(cur, end);
137	cur += st->symbols[code2].length();
138	}
139
140	// compute compressed output size
141	gain += ((int) (cur-start))-(`1`+isEscapeCode(pos: code2));
142
143	// now count the subsequent two symbols we encode as an extension codesibility
144	if (sampleFrac < `128`) { // no need to count pairs in final round
145	// consider the symbol that is the concatenation of the two last symbols
146	counters.count2Inc(pos1: code1, pos2: code2);
147
148	// as an alternative, consider just extending with the next byte..
149	if ((cur-start) > `1`) // ..but do not count single byte extensions doubly
150	counters.count2Inc(pos1: code1, pos2: *start);
151	}
152	code1 = code2;
153	}
154	}
155	}
156	return gain;
157	};
158
159	auto makeTable = [&](SymbolTable *st, Counters &counters) {
160	// hashmap of c (needed because we can generate duplicate candidates)
161	unordered_set<QSymbol> cands;
162
163	// artificially make terminater the most frequent symbol so it gets included
164	u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator;
165	counters.count1Set(pos1: terminator,val: `65535`);
166
167	auto addOrInc = [&](unordered_set<QSymbol> &cands, Symbol s, u64 count) {
168	if (count < (`5`sampleFrac)/`128`) return; // improves both compression speed (less candidates), but also quality!!*
169	QSymbol q;
170	q.symbol = s;
171	q.gain = count * s.length();
172	auto it = cands.find(x: q);
173	if (it != cands.end()) {
174	q.gain += (*it).gain;
175	cands.erase(x: *it);
176	}
177	cands.insert(x: q);
178	};
179
180	// add candidate symbols based on counted frequency
181	for (u32 pos1=`0`; pos1<FSST_CODE_BASE+(size_t) st->nSymbols; pos1++) {
182	u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
183	if (!cnt1) continue;
184
185	// heuristic: promoting single-byte symbols (8) helps reduce exception rates and increases [de]compression speed*
186	Symbol s1 = st->symbols[pos1];
187	addOrInc(cands, s1, ((s1.length()==`1`)?`8LL`:`1LL`)*cnt1);
188
189	if (sampleFrac >= `128` \|\| // last round we do not create new (combined) symbols
190	s1.length() == Symbol::maxLength \|\| // symbol cannot be extended
191	s1.val.str[`0`] == st->terminator) { // multi-byte symbols cannot contain the terminator byte
192	continue;
193	}
194	for (u32 pos2=`0`; pos2<FSST_CODE_BASE+(size_t)st->nSymbols; pos2++) {
195	u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
196	if (!cnt2) continue;
197
198	// create a new symbol
199	Symbol s2 = st->symbols[pos2];
200	Symbol s3 = concat(a: s1, b: s2);
201	if (s2.val.str[`0`] != st->terminator) // multi-byte symbols cannot contain the terminator byte
202	addOrInc(cands, s3, cnt2);
203	}
204	}
205
206	// insert candidates into priority queue (by gain)
207	auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) \|\| (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); };
208	priority_queue<QSymbol,vector<QSymbol>,decltype(cmpGn)> pq(cmpGn);
209	for (auto& q : cands)
210	pq.push(x: q);
211
212	// Create new symbol map using best candidates
213	st->clear();
214	while (st->nSymbols < `255` && !pq.empty()) {
215	QSymbol q = pq.top();
216	pq.pop();
217	st->add(s: q.symbol);
218	}
219	};
220
221	u8 bestCounters[`512`*sizeof(u16)];
222	#ifdef NONOPT_FSST
223	for(size_t frac : {`127`, `127`, `127`, `127`, `127`, `127`, `127`, `127`, `127`, `128`}) {
224	sampleFrac = frac;
225	#else
226	for(sampleFrac=`8`; true; sampleFrac += `30`) {
227	#endif
228	memset(s: &counters, c: `0`, n: sizeof(Counters));
229	long gain = compressCount (st, counters);
230	if (gain >= bestGain) { // a new best solution!
231	counters.backup1(buf: bestCounters);
232	bestTable = st; bestGain = gain;
233	}
234	if (sampleFrac >= `128`) break; // we do 5 rounds (sampleFrac=8,38,68,98,128)
235	makeTable (st, counters);
236	}
237	delete st;
238	counters.restore1(buf: bestCounters);
239	makeTable (bestTable, counters);
240	bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression
241	return bestTable;
242	}
243
244	static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, size_t len[], u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) {
245	size_t curLine = `0`, inOff = `0`, outOff = `0`, batchPos = `0`, empty = `0`, budget = size;
246	u8 lim = dst + size, codeBase = symbolBase + (`1`<<`18`); // 512KB temp space for compressing 512 strings
247	SIMDjob input[`512`]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer
248	SIMDjob output[`512`]; // output are (pos:9,dst:19) end pointers (compute compressed length from this)
249	size_t jobLine[`512`]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs)
250
251	while (curLine < nlines && outOff <= (`1`<<`19`)) {
252	size_t prevLine = curLine, chunk, curOff = `0`;
253
254	// bail out if the output buffer cannot hold the compressed next string fully
255	if (((len[curLine]-curOff)`2` + `7`) > budget) break; // see below for the +7*
256	else budget -= (len[curLine]-curOff)*`2`;
257
258	strOut[curLine] = (u8*) `0`;
259	lenOut[curLine] = `0`;
260
261	do {
262	do {
263	chunk = len[curLine] - curOff;
264	if (chunk > `511`) {
265	chunk = `511`; // large strings need to be chopped up into segments of 511 bytes
266	}
267	// create a job in this batch
268	SIMDjob job;
269	job.cur = inOff;
270	job.end = job.cur + chunk;
271	job.pos = batchPos;
272	job.out = outOff;
273
274	// worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros)
275	outOff += `7` + `2`(size_t)(job.end - job.cur); // note, total size needed is 512(5112+7) bytes.*
276	if (outOff > (`1`<<`19`)) break; // simdbuf may get full, stop before this chunk
277
278	// register job in this batch
279	input[batchPos] = job;
280	jobLine[batchPos] = curLine;
281
282	if (chunk == `0`) {
283	empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out
284	} else {
285	// copy string chunk into temp buffer
286	memcpy(dest: symbolBase + inOff, src: line[curLine] + curOff, n: chunk);
287	inOff += chunk;
288	curOff += chunk;
289	symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded
290	}
291	if (++batchPos == `512`) break;
292	} while(curOff < len[curLine]);
293
294	if ((batchPos == `512`) \|\| (outOff > (`1`<<`19`)) \|\| (++curLine >= nlines)) { // cannot accumulate more?
295	if (batchPos-empty >= `32`) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling)
296	// radix-sort jobs on length (longest string first)
297	// -- this provides best load balancing and allows to skip empty jobs at the end
298	u16 sortpos[`513`];
299	memset(s: sortpos, c: `0`, n: sizeof(sortpos));
300
301	// calculate length histo
302	for(size_t i=`0`; i<batchPos; i++) {
303	size_t len = input[i].end - input[i].cur;
304	sortpos[`512UL` - len]++;
305	}
306	// calculate running sum
307	for(size_t i=`1`; i<=`512`; i++)
308	sortpos[i] += sortpos[i-`1`];
309
310	// move jobs to their final destination
311	SIMDjob inputOrdered[`512`];
312	for(size_t i=`0`; i<batchPos; i++) {
313	size_t len = input[i].end - input[i].cur;
314	size_t pos = sortpos[`511UL` - len]++;
315	inputOrdered[pos] = input[i];
316	}
317	// finally.. SIMD compress max 256KB of simdbuf into (max) 512KB of simdbuf (but presumably much less..)
318	for(size_t done = duckdb_fsst_compressAVX512(symbolTable, codeBase, symbolBase, input: inputOrdered, output, n: batchPos-empty, unroll);
319	done < batchPos; done++) output[done] = inputOrdered[done];
320	} else {
321	memcpy(dest: output, src: input, n: batchPos*sizeof(SIMDjob));
322	}
323
324	// finish encoding (unfinished strings in process, plus the few last strings not yet processed)
325	for(size_t i=`0`; i<batchPos; i++) {
326	SIMDjob job = output[i];
327	if (job.cur < job.end) { // finish encoding this string with scalar code
328	u8* cur = symbolBase + job.cur;
329	u8* end = symbolBase + job.end;
330	u8* out = codeBase + job.out;
331	while (cur < end) {
332	u64 word = fsst_unaligned_load(V: cur);
333	size_t code = symbolTable.shortCodes[word & `0xFFFF`];
334	size_t pos = word & `0xFFFFFF`;
335	size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-`1`);
336	Symbol s = symbolTable.hashTab[idx];
337	out[`1`] = (u8) word; // speculatively write out escaped byte
338	word &= (`0xFFFFFFFFFFFFFFFF` >> (u8) s.icl);
339	if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
340	*out++ = (u8) s.code(); cur += s.length();
341	} else {
342	// could be a 2-byte or 1-byte code, or miss
343	// handle everything with predication
344	*out = (u8) code;
345	out += `1`+((code&FSST_CODE_BASE)>>`8`);
346	cur += (code>>FSST_LEN_BITS);
347	}
348	}
349	job.out = out - codeBase;
350	}
351	// postprocess job info
352	job.cur = `0`;
353	job.end = job.out - input[job.pos].out; // misuse .end field as compressed size
354	job.out = input[job.pos].out; // reset offset to start of encoded string
355	input[job.pos] = job;
356	}
357
358	// copy out the result data
359	for(size_t i=`0`; i<batchPos; i++) {
360	size_t lineNr = jobLine[i]; // the sort must be order-preserving, as we concatenate results string in order
361	size_t sz = input[i].end; // had stored compressed lengths here
362	if (!strOut[lineNr]) strOut[lineNr] = dst; // first segment will be the strOut pointer
363	lenOut[lineNr] += sz; // add segment (lenOut starts at 0 for this reason)
364	memcpy(dest: dst, src: codeBase+input[i].out, n: sz);
365	dst += sz;
366	}
367
368	// go for the next batch of 512 chunks
369	inOff = outOff = batchPos = empty = `0`;
370	budget = (size_t) (lim - dst);
371	}
372	} while (curLine == prevLine && outOff <= (`1`<<`19`));
373	}
374	return curLine;
375	}
376
377
378	// optimized adaptive scalar* compression method*
379	static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_t lenIn[], u8* strIn[], size_t size, u8* out, size_t lenOut[], u8* strOut[], bool noSuffixOpt, bool avoidBranch) {
380	u8 cur = NULL, end = NULL, *lim = out + size;
381	size_t curLine, suffixLim = symbolTable.suffixLim;
382	u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[`0`];
383
384	u8 buf[`512`+`7`] = {}; / +7 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds /
385
386	// three variants are possible. dead code falls away since the bool arguments are constants
387	auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) {
388	while (cur < end) {
389	u64 word = fsst_unaligned_load(V: cur);
390	size_t code = symbolTable.shortCodes[word & `0xFFFF`];
391	if (noSuffixOpt && ((u8) code) < suffixLim) {
392	// 2 byte code without having to worry about longer matches
393	*out++ = (u8) code; cur += `2`;
394	} else {
395	size_t pos = word & `0xFFFFFF`;
396	size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-`1`);
397	Symbol s = symbolTable.hashTab[idx];
398	out[`1`] = (u8) word; // speculatively write out escaped byte
399	word &= (`0xFFFFFFFFFFFFFFFF` >> (u8) s.icl);
400	if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
401	*out++ = (u8) s.code(); cur += s.length();
402	} else if (avoidBranch) {
403	// could be a 2-byte or 1-byte code, or miss
404	// handle everything with predication
405	*out = (u8) code;
406	out += `1`+((code&FSST_CODE_BASE)>>`8`);
407	cur += (code>>FSST_LEN_BITS);
408	} else if ((u8) code < byteLim) {
409	// 2 byte code after checking there is no longer pattern
410	*out++ = (u8) code; cur += `2`;
411	} else {
412	// 1 byte code or miss.
413	*out = (u8) code;
414	out += `1`+((code&FSST_CODE_BASE)>>`8`); // predicated - tested with a branch, that was always worse
415	cur++;
416	}
417	}
418	}
419	};
420
421	for(curLine=`0`; curLine<nlines; curLine++) {
422	size_t chunk, curOff = `0`;
423	strOut[curLine] = out;
424	do {
425	cur = strIn[curLine] + curOff;
426	chunk = lenIn[curLine] - curOff;
427	if (chunk > `511`) {
428	chunk = `511`; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST
429	}
430	if ((`2`*chunk+`7`) > (size_t) (lim-out)) {
431	return curLine; // out of memory
432	}
433	// copy the string to the 511-byte buffer
434	memcpy(dest: buf, src: cur, n: chunk);
435	buf[chunk] = (u8) symbolTable.terminator;
436	cur = buf;
437	end = cur + chunk;
438
439	// based on symboltable stats, choose a variant that is nice to the branch predictor
440	if (noSuffixOpt) {
441	compressVariant(true,false);
442	} else if (avoidBranch) {
443	compressVariant(false,true);
444	} else {
445	compressVariant(false, false);
446	}
447	} while((curOff += chunk) < lenIn[curLine]);
448	lenOut[curLine] = (size_t) (out - strOut[curLine]);
449	}
450	return curLine;
451	}
452
453	#define FSST_SAMPLELINE ((size_t) 512)
454
455	// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes
456	vector<u8> makeSample(u8 sampleBuf, u8* strIn[], size_t *lenIn, size_t nlines,
457	unique_ptr<vector<size_t>>& sample_len_out) {
458	size_t totSize = `0`;
459	vector<u8*> sample;
460
461	for(size_t i=`0`; i<nlines; i++)
462	totSize += lenIn[i];
463	if (totSize < FSST_SAMPLETARGET) {
464	for(size_t i=`0`; i<nlines; i++)
465	sample.push_back(x: strIn[i]);
466	} else {
467	size_t sampleRnd = FSST_HASH(`4637947`);
468	u8* sampleLim = sampleBuf + FSST_SAMPLETARGET;
469
470	sample_len_out = unique_ptr<vector<size_t>>(new vector<size_t>());
471	sample_len_out ->reserve(n: nlines + FSST_SAMPLEMAXSZ/FSST_SAMPLELINE);
472
473	// This fails if we have a lot of small strings and a few big ones?
474	while(sampleBuf < sampleLim) {
475	// choose a non-empty line
476	sampleRnd = FSST_HASH(sampleRnd);
477	size_t linenr = sampleRnd % nlines;
478	while (lenIn[linenr] == `0`)
479	if (++linenr == nlines) linenr = `0`;
480
481	// choose a chunk
482	size_t chunks = `1` + ((lenIn[linenr]-`1`) / FSST_SAMPLELINE);
483	sampleRnd = FSST_HASH(sampleRnd);
484	size_t chunk = FSST_SAMPLELINE*(sampleRnd % chunks);
485
486	// add the chunk to the sample
487	size_t len = min(lenIn[linenr]-chunk,FSST_SAMPLELINE);
488	memcpy(dest: sampleBuf, src: strIn[linenr]+chunk, n: len);
489	sample.push_back(x: sampleBuf);
490
491	sample_len_out ->push_back(x: len);
492	sampleBuf += len;
493	}
494	}
495	return sample;
496	}
497
498	extern "C" duckdb_fsst_encoder_t* duckdb_fsst_create(size_t n, size_t lenIn[], u8 strIn[], int* zeroTerminated) {
499	u8* sampleBuf = new u8[FSST_SAMPLEMAXSZ];
500	unique_ptr<vector<size_t>> sample_sizes;
501	vector<u8> sample = makeSample(sampleBuf, strIn, lenIn, nlines: n?n:`1`, sample_len_out&: sample_sizes); // careful handling of input to get a right-size and representative sample*
502	Encoder encoder = new* Encoder ();
503	size_t* sampleLen = sample_sizes ? sample_sizes ->data() : &lenIn[`0`];
504	encoder->symbolTable = shared_ptr<SymbolTable>(buildSymbolTable(counters&: encoder->counters, line: sample, len: sampleLen, zeroTerminated));
505	delete[] sampleBuf;
506	return (duckdb_fsst_encoder_t*) encoder;
507	}
508
509	/ create another encoder instance, necessary to do multi-threaded encoding using the same symbol table /
510	extern "C" duckdb_fsst_encoder_t* duckdb_fsst_duplicate(duckdb_fsst_encoder_t *encoder) {
511	Encoder e = new* Encoder ();
512	e->symbolTable = ((Encoder)encoder)->symbolTable; // it is a shared_ptr*
513	return (duckdb_fsst_encoder_t*) e;
514	}
515
516	// export a symbol table in compact format.
517	extern "C" u32 duckdb_fsst_export(duckdb_fsst_encoder_t encoder, u8 buf) {
518	Encoder e = (Encoder) encoder;
519	// In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there.
520	// This is sufficient in principle to reconstruct* a duckdb_fsst_encoder_t from a duckdb_fsst_decoder_t*
521	// (such functionality could be useful to append compressed data to an existing block).
522	//
523	// However, the hash function in the encoder hash table is endian-sensitive, and given its
524	// 'lossy perfect' hashing scheme is unable* to contain other-endian-produced symbol tables.*
525	// Doing a endian-conversion during hashing will be slow and self-defeating.
526	//
527	// Overall, we could support reconstructing an encoder for incremental compression, but
528	// should enforce equal-endianness. Bit of a bummer. Not going there now.
529	//
530	// The version field is now there just for future-proofness, but not used yet
531
532	// version allows keeping track of fsst versions, track endianness, and encoder reconstruction
533	u64 version = (FSST_VERSION << `32`) \| // version is 24 bits, most significant byte is 0
534	(((u64) e->symbolTable ->suffixLim) << `24`) \|
535	(((u64) e->symbolTable ->terminator) << `16`) \|
536	(((u64) e->symbolTable ->nSymbols) << `8`) \|
537	FSST_ENDIAN_MARKER; // least significant byte is nonzero
538
539	/ do not assume unaligned reads here /
540	memcpy(dest: buf, src: &version, n: `8`);
541	buf[`8`] = e->symbolTable ->zeroTerminated;
542	for(u32 i=`0`; i<`8`; i++)
543	buf[`9`+i] = (u8) e->symbolTable ->lenHisto[i];
544	u32 pos = `17`;
545
546	// emit only the used bytes of the symbols
547	for(u32 i = e->symbolTable ->zeroTerminated; i < e->symbolTable ->nSymbols; i++)
548	for(u32 j = `0`; j < e->symbolTable ->symbols[i].length(); j++)
549	buf[pos++] = e->symbolTable ->symbols[i].val.str[j]; // serialize used symbol bytes
550
551	return pos; // length of what was serialized
552	}
553
554	#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */
555
556	extern "C" u32 duckdb_fsst_import(duckdb_fsst_decoder_t decoder, u8 buf) {
557	u64 version = `0`;
558	u32 code, pos = `17`;
559	u8 lenHisto[`8`];
560
561	// version field (first 8 bytes) is now there just for future-proofness, unused still (skipped)
562	memcpy(dest: &version, src: buf, n: `8`);
563	if ((version>>`32`) != FSST_VERSION) return `0`;
564	decoder->zeroTerminated = buf[`8`]&`1`;
565	memcpy(dest: lenHisto, src: buf+`9`, n: `8`);
566
567	// in case of zero-terminated, first symbol is "" (zero always, may be overwritten)
568	decoder->len[`0`] = `1`;
569	decoder->symbol[`0`] = `0`;
570
571	// we use lenHisto[0] as 1-byte symbol run length (at the end)
572	code = decoder->zeroTerminated;
573	if (decoder->zeroTerminated) lenHisto[`0`]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end
574
575	// now get all symbols from the buffer
576	for(u32 l=`1`; l<=`8`; l++) { / l = 1,2,3,4,5,6,7,8 /
577	for(u32 i=`0`; i < lenHisto[(l&`7`) / 1,2,3,4,5,6,7,0 /]; i++, code++) {
578	decoder->len[code] = (l&`7`)+`1`; / len = 2,3,4,5,6,7,8,1 /
579	decoder->symbol[code] = `0`;
580	for(u32 j=`0`; j<decoder->len[code]; j++)
581	((u8) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols*
582	}
583	}
584	if (decoder->zeroTerminated) lenHisto[`0`]++;
585
586	// fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols).
587	while(code<`255`) {
588	decoder->symbol[code] = FSST_CORRUPT;
589	decoder->len[code++] = `8`;
590	}
591	return pos;
592	}
593
594	// runtime check for simd
595	inline size_t _compressImpl(Encoder e, size_t nlines, size_t lenIn[], u8 strIn[], size_t size, u8 output, size_t lenOut, u8 strOut[], bool* noSuffixOpt, bool avoidBranch, int simd) {
596	#ifndef NONOPT_FSST
597	if (simd && duckdb_fsst_hasAVX512())
598	return compressSIMD(symbolTable&: *e->symbolTable, symbolBase: e->simdbuf, nlines, len: lenIn, line: strIn, size, dst: output, lenOut, strOut, unroll: simd);
599	#endif
600	(void) simd;
601	return compressBulk(symbolTable&: *e->symbolTable, nlines, lenIn, strIn, size, out: output, lenOut, strOut, noSuffixOpt, avoidBranch);
602	}
603	size_t compressImpl(Encoder e, size_t nlines, size_t lenIn[], u8 strIn[], size_t size, u8 output, size_t lenOut, u8 strOut[], bool* noSuffixOpt, bool avoidBranch, int simd) {
604	return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
605	}
606
607	// adaptive choosing of scalar compression method based on symbol length histogram
608	inline size_t _compressAuto(Encoder e, size_t nlines, size_t lenIn[], u8 strIn[], size_t size, u8 output, size_t lenOut, u8 strOut[], int* simd) {
609	bool avoidBranch = false, noSuffixOpt = false;
610	if (`100`e->symbolTable ->lenHisto[`1`] > `65`e->symbolTable ->nSymbols && `100`e->symbolTable ->suffixLim > `95`e->symbolTable ->lenHisto[`1`]) {
611	noSuffixOpt = true;
612	} else if ((e->symbolTable ->lenHisto[`0`] > `24` && e->symbolTable ->lenHisto[`0`] < `92`) &&
613	(e->symbolTable ->lenHisto[`0`] < `43` \|\| e->symbolTable ->lenHisto[`6`] + e->symbolTable ->lenHisto[`7`] < `29`) &&
614	(e->symbolTable ->lenHisto[`0`] < `72` \|\| e->symbolTable ->lenHisto[`2`] < `72`)) {
615	avoidBranch = true;
616	}
617	return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
618	}
619	size_t compressAuto(Encoder e, size_t nlines, size_t lenIn[], u8 strIn[], size_t size, u8 output, size_t lenOut, u8 strOut[], int* simd) {
620	return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
621	}
622
623	// the main compression function (everything automatic)
624	extern "C" size_t duckdb_fsst_compress(duckdb_fsst_encoder_t encoder, size_t nlines, size_t lenIn[], u8 strIn[], size_t size, u8 output, size_t lenOut, u8 *strOut[]) {
625	// to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB)
626	size_t totLen = accumulate(first: lenIn, last: lenIn+nlines, init: `0`);
627	int simd = totLen > nlines*`12` && (nlines > `64` \|\| totLen > (size_t) `1`<<`15`);
628	return _compressAuto(e: (Encoder) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, simd: `3`simd);
629	}
630
631	/ deallocate encoder /
632	extern "C" void duckdb_fsst_destroy(duckdb_fsst_encoder_t* encoder) {
633	Encoder e = (Encoder) encoder;
634	delete e;
635	}
636
637	/ very lazy implementation relying on export and import /
638	extern "C" duckdb_fsst_decoder_t duckdb_fsst_decoder(duckdb_fsst_encoder_t *encoder) {
639	u8 buf[sizeof(duckdb_fsst_decoder_t)];
640	u32 cnt1 = duckdb_fsst_export(encoder, buf);
641	duckdb_fsst_decoder_t decoder;
642	u32 cnt2 = duckdb_fsst_import(decoder: &decoder, buf);
643	assert(cnt1 == cnt2); (void) cnt1; (void) cnt2;
644	return decoder;
645	}

Browse the source code of Velox/build/_deps/duckdb-src/third_party/fsst/libfsst.cpp