SDSL 3.0.1
Succinct Data Structure Library
qsufsort.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
4/* qsufsort.c
5 * Copyright 1999, N. Jesper Larsson, all rights reserved.
6 *
7 * This file contains an implementation of the algorithm presented in "Faster
8 * Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
9 * Sadakane (sada@is.s.u-tokyo.ac.jp).
10 *
11 * This software may be used freely for any purpose. However, when distributed,
12 * the original source must be clearly stated, and, when the source code is
13 * distributed, the copyright notice must be retained and any alterations in
14 * the code must be clearly marked. No warranty is given regarding the quality
15 of this software.*/
23#ifndef INCLUDED_SDSL_QSUFSORT
24#define INCLUDED_SDSL_QSUFSORT
25
26#define DBG_OUT \
27 if (0) std::cout
28
29#include <sdsl/int_vector.hpp>
30
31namespace sdsl
32{
33namespace qsufsort
34{
35
36template <class int_vector_type = int_vector<>>
37class sorter;
38
39// void sort(int_iter text, int_iter sa, int64_t n, int64_t k, int64_t l);
40
42
58// TODO: problem when int_width==64!!!
59template <class int_vector_type>
60void construct_sa(int_vector_type & sa, const char * file, uint8_t num_bytes)
61{
63 s.sort(sa, file, num_bytes);
64}
65
66template <class int_vector_type, class t_vec>
67void construct_sa(int_vector_type & sa, t_vec & text)
68{
70 s.sort(sa, text);
71}
72
73template <class int_vector_type>
74class sorter
75{
76 typedef int_vector_type tIV;
77 typedef typename tIV::iterator int_iter;
78 typedef typename tIV::size_type size_type;
79
80 private:
81 int_iter m_SA, // group array, ultimately suffix array.
82 m_VV; // inverse array, ultimately inverse of SA.
83 uint64_t m_rr, // number of symbols aggregated by transform.
84 m_hh; // length of already-sorted prefixes.
85 uint8_t m_msb; // most significant bit position starting from 0
86 uint64_t m_msb_mask; // mask for 1ULL<<msb
87
88 inline int64_t to_sign(uint64_t x) const { return x & m_msb_mask ? -((int64_t)(x & ~m_msb_mask)) : x; }
89 // return the absolute value of integer x
90 inline int64_t mark_pos(uint64_t x) const { return (x & ~m_msb_mask); }
91 // mark the number x as negative
92 inline int64_t mark_neg(uint64_t x) const { return x | m_msb_mask; }
93 // check if x is not negative
94 inline bool not_neg(uint64_t x) const { return !(x >> m_msb); }
95 // check if x is negative
96 inline bool is_neg(uint64_t x) const { return x & m_msb_mask; }
97 // returns the key of iterator p at the current sorting depth
98 inline uint64_t key(const int_iter & p) const { return m_VV[*p + m_hh]; }
99 // swap the value of two iterators
100 inline void swap(int_iter & p, int_iter & q) const
101 {
102 uint64_t tmp = *p;
103 *p = *q;
104 *q = tmp;
105 }
106 // select the median out of 3 elements
107 inline const int_iter & med3(const int_iter & a, const int_iter & b, const int_iter & c) const
108 {
109 return key(a) < key(b) ? (key(b) < key(c) ? b : (key(a) < key(c) ? c : a))
110 : (key(b) > key(c) ? b : (key(a) > key(c) ? c : a));
111 }
112
113 /* Subroutine for select_sort_split and sort_split. Sets group numbers for a
114 group whose lowest position in m_SA is pl and highest position is pm.*/
115 void update_group(int_iter pl, int_iter pm)
116 {
117 int64_t g = pm - m_SA; /* group number.*/
118 m_VV[*pl] = g; /* update group number of first position.*/
119 if (pl == pm)
120 *pl = mark_neg(1); /* one element, sorted group.*/
121 else
122 do /* more than one element, unsorted group.*/
123 m_VV[*++pl] = g; /* update group numbers.*/
124 while (pl < pm);
125 }
126
127 /* Quadratic sorting method to use for small subarrays. To be able to update
128 group numbers consistently, a variant of selection sorting is used.*/
129 void select_sort_split(const int_iter & p, int64_t n)
130 {
131 int_iter pa, pb, pi, pn;
132 uint64_t f, v;
133
134 pa = p; /* pa is start of group being picked out.*/
135 pn = p + n - 1; /* pn is last position of subarray.*/
136 while (pa < pn)
137 {
138 for (pi = pb = (pa + 1), f = key(pa); pi <= pn; ++pi)
139 if ((v = key(pi)) < f)
140 {
141 f = v; /* f is smallest key found.*/
142 swap(pi, pa); /* place smallest element at beginning.*/
143 pb = pa + 1; /* pb is position for elements equal to f.*/
144 }
145 else if (v == f)
146 { /* if equal to smallest key.*/
147 swap(pi, pb); /* place next to other smallest elements.*/
148 ++pb;
149 }
150 update_group(pa, pb - 1); /* update group values for new group.*/
151 pa = pb; /* continue sorting rest of the subarray.*/
152 }
153 if (pa == pn)
154 { /* check if last part is single element.*/
155 m_VV[*pa] = pa - m_SA;
156 *pa = mark_neg(1); /* sorted group.*/
157 }
158 }
159
160 /* Subroutine for sort_split, algorithm by Bentley & McIlroy.*/
161 uint64_t choose_pivot(const int_iter & p, int64_t n)
162 {
163 int_iter pl, pm, pn;
164 int64_t s;
165
166 pm = p + (n >> 1); /* small arrays, middle element.*/
167 if (n > 7LL)
168 {
169 pl = p;
170 pn = p + n - 1;
171 if (n > 40LL)
172 { /* big arrays, pseudomedian of 9.*/
173 s = n >> 3;
174 pl = med3(pl, pl + s, pl + s + s);
175 pm = med3(pm - s, pm, pm + s);
176 pn = med3(pn - s - s, pn - s, pn);
177 }
178 pm = med3(pl, pm, pn); /* midsize arrays, median of 3.*/
179 }
180 return key(pm);
181 }
182
183 /* Sorting routine called for each unsorted group. Sorts the array of integers
184 * (suffix numbers) of length n starting at p. The algorithm is a ternary-split
185 * quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
186 * Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
187 function is based on Program 7.*/
188 void sort_split(const int_iter & p, int64_t n)
189 {
190 int_iter pa, pb, pc, pd, pl, pm, pn;
191 uint64_t f, v;
192 int64_t s, t;
193
194 if (n < 7)
195 { /* multi-selection sort smallest arrays.*/
196 select_sort_split(p, n);
197 return;
198 }
199
200 v = choose_pivot(p, n);
201 pa = pb = p; // pa: iterator for equal elements
202 pc = pd = p + n - 1; // pc = right border (inclusive)
203 while (1)
204 { /* split-end partition.*/
205 while (pb <= pc && (f = key(pb)) <= v)
206 { // go to the right as long as there are keys smaller equal than v
207 if (f == v)
208 {
209 swap(pa, pb);
210 ++pa; // swap equal chars to the left
211 }
212 ++pb;
213 }
214 while (pc >= pb && (f = key(pc)) >= v)
215 { // go to the left as long as there are keys bigger or equal to v
216 if (f == v)
217 {
218 swap(pc, pd);
219 --pd; // swap equal chars to the right end
220 }
221 --pc;
222 }
223 if (pb > pc) break;
224 swap(pb,
225 pc); // swap element > v (pb) to the third part and element < v (pc) to the second
226 ++pb;
227 --pc;
228 }
229 pn = p + n;
230 if ((s = pa - p) > (t = pb - pa)) s = t;
231 for (pl = p, pm = pb - s; s; --s, ++pl, ++pm) swap(pl, pm);
232 if ((s = pd - pc) > (t = pn - pd - 1)) s = t;
233 for (pl = pb, pm = pn - s; s; --s, ++pl, ++pm) swap(pl, pm);
234 s = pb - pa;
235 t = pd - pc;
236 if (pa > pb)
237 {
238 if (s > 0) { std::cout << "s=" << s << ">0 but should be <0; n=" << n << std::endl; }
239 }
240 if (pc > pd)
241 {
242 if (t > 0) { std::cout << "t=" << t << ">0 but should be <0; n=" << n << std::endl; }
243 }
244 if (s > 0) sort_split(p, s);
245 update_group(p + s, p + n - t - 1);
246 if (t > 0) sort_split(p + n - t, t);
247 }
248
249 /* Bucketsort for first iteration.
250 *
251 * Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
252 * at least once. x[n] is 0. (This is the corresponding output of transform.) k
253 * must be at most n+1. p is array of size n+1 whose contents are disregarded.
254 *
255 * Output: x is m_VV and p is m_SA after the initial sorting stage of the refined
256 suffix sorting algorithm.*/
257 void bucketsort(const int_iter & x, const int_iter & p, int64_t n, int64_t k)
258 {
259 int_iter pi;
260 int64_t i, d, g;
261 uint64_t c;
262
263 for (pi = p; pi < p + k; ++pi) *pi = mark_neg(1); /* mark linked lists empty.*/
264 for (i = 0; i <= n; ++i)
265 {
266 x[i] = p[c = x[i]]; /* insert in linked list.*/
267 p[c] = i;
268 }
269 for (pi = p + k - 1, i = n; pi >= p; --pi)
270 {
271 d = x[c = *pi]; /* c is position, d is next in list.*/
272 x[c] = g = i; /* last position equals group number.*/
273 if (not_neg(d))
274 { /* if more than one element in group.*/
275 p[i--] = c; /* p is permutation for the sorted x.*/
276 do {
277 d = x[c = d]; /* next in linked list.*/
278 x[c] = g; /* group number in x.*/
279 p[i--] = c; /* permutation in p.*/
280 } while (not_neg(d));
281 }
282 else
283 p[i--] = mark_neg(1); /* one element, sorted group.*/
284 }
285 }
286
287 public:
288 /* Transforms the alphabet of x by attempting to aggregate several symbols into
289 * one, while preserving the suffix order of x. The alphabet may also be
290 * compacted, so that x on output comprises all integers of the new alphabet
291 * with no skipped numbers.
292 *
293 * Input: x is an array of size n+1 whose first n elements are positive
294 * integers in the range l...k-1. p is array of size n+1, used for temporary
295 * storage. q controls aggregation and compaction by defining the maximum value
296 * for any symbol during transformation: q must be at least k-l; if q<=n,
297 * compaction is guaranteed; if k-l>n, compaction is never done; if q is
298 * INT_MAX, the maximum number of symbols are aggregated into one.
299 *
300 * Output: Returns an integer j in the range 1...q representing the size of the
301 * new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
302 set to the number of old symbols grouped into one. Only x[n] is 0.*/
303
304 int64_t transform(const int_iter & x, const int_iter & p, int64_t n, int64_t k, int64_t l, int64_t q)
305 {
306 if (!(q >= k - l)) { std::cout << "q=" << q << " k-l=" << k - l << std::endl; }
307 assert(q >= k - l);
308 DBG_OUT << "transform(n=" << n << ", k=" << k << ", l=" << l << ", q=" << q << ")" << std::endl;
309 uint64_t bb, cc, dd;
310 int64_t jj;
311 int_iter pi, pj;
312 int s = bits::hi(k - l) + (k > l); /* s is number of bits in old symbol.*/
313 uint8_t len = 0; /* len is for overflow checking.*/
314 m_rr = 0;
315 for (bb = dd = 0; (int)m_rr < n && (int)len < m_msb + 1 - s && (int64_t)(cc = dd << s | (k - l)) <= q;
316 ++m_rr, len += s)
317 {
318 bb = bb << s | (x[m_rr] - l + 1); /* bb is start of x in chunk alphabet.*/
319 dd = cc; /* dd is max symbol in chunk alphabet.*/
320 }
321 DBG_OUT << "m_rr=" << m_rr << std::endl;
322 uint64_t mm = (1ULL << (m_rr - 1) * s) - 1; /* mm masks off top old symbol from chunk.*/
323 x[n] = l - 1; /* emulate zero terminator.*/
324 if ((int64_t)dd <= n)
325 { /* if bucketing possible, compact alphabet.*/
326 for (pi = p; pi <= p + dd; ++pi) *pi = 0; /* zero transformation table.*/
327 for (pi = x + m_rr, cc = bb; pi <= x + n; ++pi)
328 {
329 p[cc] = 1; /* mark used chunk symbol.*/
330 cc = (cc & mm) << s | (*pi - l + 1); /* shift in next old symbol in chunk.*/
331 }
332 for (uint64_t i = 1; i < m_rr; ++i)
333 { /* handle last r-1 positions.*/
334 p[cc] = 1; /* mark used chunk symbol.*/
335 cc = (cc & mm) << s; /* shift in next old symbol in chunk.*/
336 }
337 for (pi = p, jj = 1; pi <= p + dd; ++pi)
338 if (*pi) *pi = jj++; /* j is new alphabet size.*/
339 for (pi = x, pj = x + m_rr, cc = bb; pj <= x + n; ++pi, ++pj)
340 {
341 *pi = p[cc]; /* transform to new alphabet.*/
342 cc = (cc & mm) << s | (*pj - l + 1); /* shift in next old symbol in chunk.*/
343 }
344 while (pi < x + n)
345 { /* handle last r-1 positions.*/
346 *pi++ = p[cc]; /* transform to new alphabet.*/
347 cc = (cc & mm) << s; /* shift right-end zero in chunk.*/
348 }
349 }
350 else
351 { /* bucketing not possible, don't compact.*/
352 for (pi = x, pj = x + m_rr, cc = bb; pj <= x + n; ++pi, ++pj)
353 {
354 *pi = cc; /* transform to new alphabet.*/
355 cc = (cc & mm) << s | (*pj - l + 1); /* shift in next old symbol in chunk.*/
356 }
357 while (pi < x + n)
358 { /* handle last r-1 positions.*/
359 *pi++ = cc; /* transform to new alphabet.*/
360 cc = (cc & mm) << s; /* shift right-end zero in chunk.*/
361 }
362 jj = dd + 1; /* new alphabet size.*/
363 }
364 x[n] = 0; /* end-of-string symbol is zero.*/
365 DBG_OUT << "end transformation jj=" << jj << std::endl;
366 return jj; /* return new alphabet size.*/
367 }
368
369 /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
370 * n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
371 * contents of x[n] is disregarded, the n-th symbol being regarded as
372 end-of-string smaller than all other symbols.*/
373 void sort(const int_iter & x, const int_iter & p, int64_t n, int64_t k, int64_t l)
374 {
375 int_iter pi, pk;
376 m_VV = x; /* set global values.*/
377 m_SA = p;
378 if (n >= k - l)
379 { /* if bucketing possible,*/
380 int64_t j = transform(m_VV, m_SA, n, k, l, n);
381 DBG_OUT << "begin bucketsort j=" << j << std::endl;
382 bucketsort(m_VV, m_SA, n, j); /* bucketsort on first r positions.*/
383 DBG_OUT << "end bucketsort" << std::endl;
384 }
385 else
386 {
387 transform(m_VV, m_SA, n, k, l, m_msb_mask - 1);
388 DBG_OUT << "initialize SA begin" << std::endl;
389 for (int64_t i = 0; i <= n; ++i) m_SA[i] = i; /* initialize I with suffix numbers.*/
390 DBG_OUT << "initialize SA end" << std::endl;
391 m_hh = 0;
392 sort_split(m_SA, n + 1); /* quicksort on first r positions.*/
393 }
394 m_hh = m_rr; /* number of symbols aggregated by transform.*/
395 // while ( is_neg(*m_SA) and mark_pos(*m_SA) <= n) {
396 while (to_sign(*m_SA) >= -n)
397 {
398 // std::cout<<"m_hh="<<m_hh<<std::endl;
399 DBG_OUT << "SA = ";
400 // for(size_t iii=0; iii<=(size_t)n; ++iii){
401 // uint64_t D = *(m_SA+iii);
402 // printf("%c%lld ", is_neg(D)?'-':' ', mark_pos(D));
403 //}
404 DBG_OUT << std::endl;
405 DBG_OUT << "TEXT = ";
406 // for(size_t iii=0; iii<=(size_t)n; ++iii){
407 // uint64_t D = *(m_VV+iii);
408 // printf("%c%lld ", is_neg(D)?'-':' ', mark_pos(D));
409 //}
410 DBG_OUT << std::endl;
411 DBG_OUT << "*m_SA=" << to_sign(*m_SA) << std::endl;
412 // std::cout<<"mark_pos(*m_SA)="<<mark_pos(*m_SA)<<std::endl;
413 pi = m_SA; /* pi is first position of group.*/
414 int64_t sl = 0; /* sl is length of sorted groups.*/
415 DBG_OUT << "m_hh=" << m_hh << std::endl;
416 do {
417 uint64_t s = *pi;
418 if (to_sign(s) < (int64_t)0)
419 {
420 pi += mark_pos(s); /* skip over sorted group.*/
421 sl += mark_pos(s); /* add length to sl.*/
422 }
423 else
424 {
425 if (sl)
426 {
427 *(pi - sl) = mark_neg(sl); /* combine sorted groups before pi.*/
428 sl = 0;
429 }
430 pk = m_SA + m_VV[s] + 1; /* pk-1 is last position of unsorted group.*/
431 sort_split(pi, pk - pi);
432 pi = pk; /* next group.*/
433 }
434 } while ((pi - m_SA) <= n);
435 if (sl) /* if the array ends with a sorted group.*/
436 *(pi - sl) = mark_neg(sl); /* combine sorted groups at end of m_SA.*/
437 m_hh = 2 * m_hh; /* double sorted-depth.*/
438 DBG_OUT << "m_hh=" << m_hh << std::endl;
439 }
440 for (int64_t i = 0; i <= n; ++i)
441 { /* reconstruct suffix array from inverse.*/
442 m_SA[m_VV[i]] = i;
443 }
444 }
445
446 void do_sort(tIV & sa, tIV & x)
447 {
448 assert(x.size() > 0);
449 DBG_OUT << "x.width()=" << (int)x.width() << std::endl;
450 DBG_OUT << "x.size()=" << x.size() << std::endl;
451 DBG_OUT << "sa.width()=" << (int)sa.width() << std::endl;
452 DBG_OUT << "sa.size()=" << sa.size() << std::endl;
453 if (x.size() == 1)
454 {
455 sa = tIV(1, 0);
456 return;
457 }
458
459 int64_t max_symbol = 0, min_symbol = x.width() < 64 ? bits::lo_set[x.width()] : 0x7FFFFFFFFFFFFFFFLL;
460
461 for (size_type i = 0; i < x.size() - 1; ++i)
462 {
463 max_symbol = std::max(max_symbol, (int64_t)x[i]);
464 min_symbol = std::min(min_symbol, (int64_t)x[i]);
465 }
466
467 if (0 == min_symbol) { throw std::logic_error("Text contains 0-symbol. Suffix array can not be constructed."); }
468 if (x[x.size() - 1] > 0)
469 {
470 throw std::logic_error("Last symbol is not 0-symbol. Suffix array can not be constructed.");
471 }
472 DBG_OUT << "sorter: min_symbol=" << min_symbol << std::endl;
473 DBG_OUT << "sorter: max_symbol=" << max_symbol << std::endl;
474
475 int64_t n = x.size() - 1;
476 DBG_OUT << "x.size()-1=" << x.size() - 1 << " n=" << n << std::endl;
477 uint8_t width = std::max(bits::hi(max_symbol) + 2, bits::hi(n + 1) + 2);
478 DBG_OUT << "sorter: width=" << (int)width << " max_symbol_width=" << bits::hi(max_symbol) + 1
479 << " n_width=" << bits::hi(n) << std::endl;
480 util::expand_width(x, width);
481 sa = x;
482 if (sa.width() < x.width())
483 {
484 throw std::logic_error("Fixed size suffix array is to small for the specified text.");
485 return;
486 }
487
488 m_msb = sa.width() - 1;
489 m_msb_mask = 1ULL << m_msb;
490 DBG_OUT << "sorter: m_msb=" << (int)m_msb << " m_msb_mask=" << m_msb_mask << std::endl;
491 sort(x.begin(), sa.begin(), x.size() - 1, max_symbol + 1, min_symbol);
492 }
493
494 void sort(tIV & sa, const char * file_name, uint8_t num_bytes)
495 {
496 DBG_OUT << "sorter: sort(" << file_name << ")" << std::endl;
497 DBG_OUT << "sizeof(int_vector<>::difference_type)=" << sizeof(int_vector<>::difference_type) << std::endl;
498 util::clear(sa); // free space for sa
499 tIV x;
500 if (num_bytes == 0 and typeid(typename tIV::reference) == typeid(uint64_t))
501 {
502 DBG_OUT << "sorter: use int_vector<64>" << std::endl;
503 int_vector<> temp;
504 load_vector_from_file(temp, file_name, num_bytes);
505 x.resize(temp.size());
506 for (size_type i = 0; i < temp.size(); ++i) x[i] = temp[i];
507 }
508 else
509 {
510 load_vector_from_file(x, file_name, num_bytes);
512 }
513 do_sort(sa, x);
514 }
515
516 template <class t_vec>
517 void sort(tIV & sa, t_vec & text)
518 {
519 tIV x;
520 x.resize(text.size());
521 for (size_type i = 0; i < text.size(); ++i) x[i] = text[i];
522 do_sort(sa, x);
523 }
524};
525
526} // end namespace qsufsort
527
528} // end namespace sdsl
529
530#endif
A generic vector class for integers of width .
Definition: int_vector.hpp:216
size_type size() const noexcept
The number of elements in the int_vector.
void sort(const int_iter &x, const int_iter &p, int64_t n, int64_t k, int64_t l)
Definition: qsufsort.hpp:373
void do_sort(tIV &sa, tIV &x)
Definition: qsufsort.hpp:446
void sort(tIV &sa, t_vec &text)
Definition: qsufsort.hpp:517
void sort(tIV &sa, const char *file_name, uint8_t num_bytes)
Definition: qsufsort.hpp:494
int64_t transform(const int_iter &x, const int_iter &p, int64_t n, int64_t k, int64_t l, int64_t q)
Definition: qsufsort.hpp:304
int_vector.hpp contains the sdsl::int_vector class.
int_vector ::size_type size_type
void construct_sa(int_vector_type &sa, const char *file, uint8_t num_bytes)
Construct a suffix array for the sequence stored in a file.
Definition: qsufsort.hpp:60
void expand_width(t_int_vec &v, uint8_t new_width)
Expands the integer width to new_width >= v.width()
Definition: util.hpp:509
void bit_compress(t_int_vec &v)
Bit compress the int_vector.
Definition: util.hpp:484
Namespace for the succinct data structure library.
bool load_vector_from_file(t_int_vec &v, const std::string &file, uint8_t num_bytes=1, uint8_t max_int_width=64)
from disk.
Definition: io.hpp:187
#define DBG_OUT
Definition: qsufsort.hpp:26
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:651
static constexpr uint64_t lo_set[65]
lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
Definition: bits.hpp:187