23#ifndef INCLUDED_SDSL_QSUFSORT
24#define INCLUDED_SDSL_QSUFSORT
36template <
class int_vector_type =
int_vector<>>
59template <
class int_vector_type>
60void construct_sa(int_vector_type & sa,
const char * file, uint8_t num_bytes)
63 s.
sort(sa, file, num_bytes);
66template <
class int_vector_type,
class t_vec>
73template <
class int_vector_type>
76 typedef int_vector_type tIV;
77 typedef typename tIV::iterator int_iter;
88 inline int64_t to_sign(uint64_t x)
const {
return x & m_msb_mask ? -((int64_t)(x & ~m_msb_mask)) : x; }
90 inline int64_t mark_pos(uint64_t x)
const {
return (x & ~m_msb_mask); }
92 inline int64_t mark_neg(uint64_t x)
const {
return x | m_msb_mask; }
94 inline bool not_neg(uint64_t x)
const {
return !(x >> m_msb); }
96 inline bool is_neg(uint64_t x)
const {
return x & m_msb_mask; }
98 inline uint64_t key(
const int_iter & p)
const {
return m_VV[*p + m_hh]; }
100 inline void swap(int_iter & p, int_iter & q)
const
107 inline const int_iter & med3(
const int_iter & a,
const int_iter & b,
const int_iter & c)
const
109 return key(a) < key(b) ? (key(b) < key(c) ? b : (key(a) < key(c) ? c : a))
110 : (key(b) > key(c) ? b : (key(a) > key(c) ? c : a));
115 void update_group(int_iter pl, int_iter pm)
117 int64_t g = pm - m_SA;
129 void select_sort_split(
const int_iter & p, int64_t n)
131 int_iter pa, pb, pi, pn;
138 for (pi = pb = (pa + 1), f = key(pa); pi <= pn; ++pi)
139 if ((v = key(pi)) < f)
150 update_group(pa, pb - 1);
155 m_VV[*pa] = pa - m_SA;
161 uint64_t choose_pivot(
const int_iter & p, int64_t n)
174 pl = med3(pl, pl + s, pl + s + s);
175 pm = med3(pm - s, pm, pm + s);
176 pn = med3(pn - s - s, pn - s, pn);
178 pm = med3(pl, pm, pn);
188 void sort_split(
const int_iter & p, int64_t n)
190 int_iter pa, pb, pc, pd, pl, pm, pn;
196 select_sort_split(p, n);
200 v = choose_pivot(p, n);
205 while (pb <= pc && (f = key(pb)) <= v)
214 while (pc >= pb && (f = key(pc)) >= v)
230 if ((s = pa - p) > (t = pb - pa)) s = t;
231 for (pl = p, pm = pb - s; s; --s, ++pl, ++pm) swap(pl, pm);
232 if ((s = pd - pc) > (t = pn - pd - 1)) s = t;
233 for (pl = pb, pm = pn - s; s; --s, ++pl, ++pm) swap(pl, pm);
238 if (s > 0) { std::cout <<
"s=" << s <<
">0 but should be <0; n=" << n << std::endl; }
242 if (t > 0) { std::cout <<
"t=" << t <<
">0 but should be <0; n=" << n << std::endl; }
244 if (s > 0) sort_split(p, s);
245 update_group(p + s, p + n - t - 1);
246 if (t > 0) sort_split(p + n - t, t);
257 void bucketsort(
const int_iter & x,
const int_iter & p, int64_t n, int64_t k)
263 for (pi = p; pi < p + k; ++pi) *pi = mark_neg(1);
264 for (i = 0; i <= n; ++i)
269 for (pi = p + k - 1, i = n; pi >= p; --pi)
280 }
while (not_neg(d));
283 p[i--] = mark_neg(1);
304 int64_t
transform(
const int_iter & x,
const int_iter & p, int64_t n, int64_t k, int64_t l, int64_t q)
306 if (!(q >= k - l)) { std::cout <<
"q=" << q <<
" k-l=" << k - l << std::endl; }
308 DBG_OUT <<
"transform(n=" << n <<
", k=" << k <<
", l=" << l <<
", q=" << q <<
")" << std::endl;
315 for (bb = dd = 0; (int)m_rr < n && (
int)len < m_msb + 1 - s && (int64_t)(cc = dd << s | (k - l)) <= q;
318 bb = bb << s | (x[m_rr] - l + 1);
321 DBG_OUT <<
"m_rr=" << m_rr << std::endl;
322 uint64_t mm = (1ULL << (m_rr - 1) * s) - 1;
324 if ((int64_t)dd <= n)
326 for (pi = p; pi <= p + dd; ++pi) *pi = 0;
327 for (pi = x + m_rr, cc = bb; pi <= x + n; ++pi)
330 cc = (cc & mm) << s | (*pi - l + 1);
332 for (uint64_t i = 1; i < m_rr; ++i)
337 for (pi = p, jj = 1; pi <= p + dd; ++pi)
339 for (pi = x, pj = x + m_rr, cc = bb; pj <= x + n; ++pi, ++pj)
342 cc = (cc & mm) << s | (*pj - l + 1);
352 for (pi = x, pj = x + m_rr, cc = bb; pj <= x + n; ++pi, ++pj)
355 cc = (cc & mm) << s | (*pj - l + 1);
365 DBG_OUT <<
"end transformation jj=" << jj << std::endl;
373 void sort(
const int_iter & x,
const int_iter & p, int64_t n, int64_t k, int64_t l)
380 int64_t j =
transform(m_VV, m_SA, n, k, l, n);
381 DBG_OUT <<
"begin bucketsort j=" << j << std::endl;
382 bucketsort(m_VV, m_SA, n, j);
383 DBG_OUT <<
"end bucketsort" << std::endl;
387 transform(m_VV, m_SA, n, k, l, m_msb_mask - 1);
388 DBG_OUT <<
"initialize SA begin" << std::endl;
389 for (int64_t i = 0; i <= n; ++i) m_SA[i] = i;
390 DBG_OUT <<
"initialize SA end" << std::endl;
392 sort_split(m_SA, n + 1);
396 while (to_sign(*m_SA) >= -n)
411 DBG_OUT <<
"*m_SA=" << to_sign(*m_SA) << std::endl;
415 DBG_OUT <<
"m_hh=" << m_hh << std::endl;
418 if (to_sign(s) < (int64_t)0)
427 *(pi - sl) = mark_neg(sl);
430 pk = m_SA + m_VV[s] + 1;
431 sort_split(pi, pk - pi);
434 }
while ((pi - m_SA) <= n);
436 *(pi - sl) = mark_neg(sl);
438 DBG_OUT <<
"m_hh=" << m_hh << std::endl;
440 for (int64_t i = 0; i <= n; ++i)
448 assert(x.size() > 0);
449 DBG_OUT <<
"x.width()=" << (int)x.width() << std::endl;
450 DBG_OUT <<
"x.size()=" << x.size() << std::endl;
451 DBG_OUT <<
"sa.width()=" << (int)sa.width() << std::endl;
452 DBG_OUT <<
"sa.size()=" << sa.size() << std::endl;
459 int64_t max_symbol = 0, min_symbol = x.width() < 64 ?
bits::lo_set[x.width()] : 0x7FFFFFFFFFFFFFFFLL;
461 for (size_type i = 0; i < x.size() - 1; ++i)
463 max_symbol = std::max(max_symbol, (int64_t)x[i]);
464 min_symbol = std::min(min_symbol, (int64_t)x[i]);
467 if (0 == min_symbol) {
throw std::logic_error(
"Text contains 0-symbol. Suffix array can not be constructed."); }
468 if (x[x.size() - 1] > 0)
470 throw std::logic_error(
"Last symbol is not 0-symbol. Suffix array can not be constructed.");
472 DBG_OUT <<
"sorter: min_symbol=" << min_symbol << std::endl;
473 DBG_OUT <<
"sorter: max_symbol=" << max_symbol << std::endl;
475 int64_t n = x.size() - 1;
476 DBG_OUT <<
"x.size()-1=" << x.size() - 1 <<
" n=" << n << std::endl;
478 DBG_OUT <<
"sorter: width=" << (int)width <<
" max_symbol_width=" <<
bits::hi(max_symbol) + 1
479 <<
" n_width=" <<
bits::hi(n) << std::endl;
482 if (sa.width() < x.width())
484 throw std::logic_error(
"Fixed size suffix array is to small for the specified text.");
488 m_msb = sa.width() - 1;
489 m_msb_mask = 1ULL << m_msb;
490 DBG_OUT <<
"sorter: m_msb=" << (int)m_msb <<
" m_msb_mask=" << m_msb_mask << std::endl;
491 sort(x.begin(), sa.begin(), x.size() - 1, max_symbol + 1, min_symbol);
494 void sort(tIV & sa,
const char * file_name, uint8_t num_bytes)
496 DBG_OUT <<
"sorter: sort(" << file_name <<
")" << std::endl;
500 if (num_bytes == 0 and
typeid(
typename tIV::reference) ==
typeid(uint64_t))
502 DBG_OUT <<
"sorter: use int_vector<64>" << std::endl;
505 x.resize(temp.
size());
506 for (size_type i = 0; i < temp.
size(); ++i) x[i] = temp[i];
516 template <
class t_vec>
517 void sort(tIV & sa, t_vec & text)
520 x.resize(text.size());
521 for (size_type i = 0; i < text.size(); ++i) x[i] = text[i];
A generic vector class for integers of width .
size_type size() const noexcept
The number of elements in the int_vector.
void sort(const int_iter &x, const int_iter &p, int64_t n, int64_t k, int64_t l)
void do_sort(tIV &sa, tIV &x)
void sort(tIV &sa, t_vec &text)
void sort(tIV &sa, const char *file_name, uint8_t num_bytes)
int64_t transform(const int_iter &x, const int_iter &p, int64_t n, int64_t k, int64_t l, int64_t q)
int_vector.hpp contains the sdsl::int_vector class.
int_vector ::size_type size_type
void construct_sa(int_vector_type &sa, const char *file, uint8_t num_bytes)
Construct a suffix array for the sequence stored in a file.
void expand_width(t_int_vec &v, uint8_t new_width)
Expands the integer width to new_width >= v.width()
void bit_compress(t_int_vec &v)
Bit compress the int_vector.
Namespace for the succinct data structure library.
bool load_vector_from_file(t_int_vec &v, const std::string &file, uint8_t num_bytes=1, uint8_t max_int_width=64)
from disk.
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
static constexpr uint64_t lo_set[65]
lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.