SDSL 3.0.1
Succinct Data Structure Library
csa_sada.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_CSA_SADA
9#define INCLUDED_SDSL_CSA_SADA
10
11#include <algorithm>
12#include <cassert>
13#include <cstring> // for strlen
14#include <iomanip>
15#include <iostream>
16#include <iterator>
17
20#include <sdsl/enc_vector.hpp>
21#include <sdsl/int_vector.hpp>
22#include <sdsl/iterators.hpp>
24#include <sdsl/util.hpp>
25
26namespace sdsl
27{
28
30
41template <class t_enc_vec = enc_vector<>, // Vector type used to store the Psi-function
42 uint32_t t_dens = 32, // Sample density for suffix array (SA) values
43 uint32_t t_inv_dens = 64, // Sample density for inverse suffix array (ISA) values
44 class t_sa_sample_strat = sa_order_sa_sampling<>, // Policy class for the SA sampling.
45 class t_isa_sample_strat = isa_sampling<>, // Policy class for ISA sampling.
46 class t_alphabet_strat = byte_alphabet // Policy class for the representation of the alphabet.
47 >
49{
50 static_assert(is_enc_vec<t_enc_vec>::value, "First template argument has to be of type env_vector.");
51 static_assert(t_dens > 0, "Second template argument has to be greater then 0.");
52 static_assert(t_inv_dens > 0, "Third template argument has to be greater then 0.");
53 static_assert(std::is_same<typename sampling_tag<t_sa_sample_strat>::type, sa_sampling_tag>::value,
54 "Forth template argument has to be a suffix array sampling strategy.");
55 static_assert(std::is_same<typename sampling_tag<t_isa_sample_strat>::type, isa_sampling_tag>::value,
56 "Fifth template argument has to be a inverse suffix array sampling strategy.");
57 static_assert(is_alphabet<t_alphabet_strat>::value, "Sixth template argument has to be a alphabet strategy.");
58
59 friend class bwt_of_csa_psi<csa_sada>;
60
61 public:
62 enum
63 {
65 isa_sample_dens = t_inv_dens
66 };
67
68 typedef uint64_t value_type;
74 typedef const pointer const_pointer;
77 typedef ptrdiff_t difference_type;
78 typedef t_enc_vec enc_vector_type;
85 typedef typename t_sa_sample_strat::template type<csa_sada> sa_sample_type;
86 typedef typename t_isa_sample_strat::template type<csa_sada> isa_sample_type;
87 typedef t_alphabet_strat alphabet_type;
88 typedef typename alphabet_type::alphabet_category alphabet_category;
89 typedef typename alphabet_type::comp_char_type comp_char_type;
90 typedef typename alphabet_type::char_type char_type; // Note: This is the char type of the CSA not the WT!
91 typedef typename alphabet_type::string_type string_type;
93
96
97 friend class traverse_csa_psi<csa_sada, true>;
98 friend class traverse_csa_psi<csa_sada, false>;
99
100 static const uint32_t linear_decode_limit = 100000;
101
102 private:
103 enc_vector_type m_psi; // psi function
104 sa_sample_type m_sa_sample; // suffix array samples
105 isa_sample_type m_isa_sample; // inverse suffix array samples
106 alphabet_type m_alphabet; // alphabet component
107
108 mutable std::vector<uint64_t> m_psi_buf; // buffer for decoded psi values
109
110 void create_buffer()
111 {
112 if (enc_vector_type::sample_dens < linear_decode_limit)
113 {
114 m_psi_buf = std::vector<uint64_t>(enc_vector_type::sample_dens + 1);
115 }
116 }
117
118 public:
119 const typename alphabet_type::char2comp_type & char2comp = m_alphabet.char2comp;
120 const typename alphabet_type::comp2char_type & comp2char = m_alphabet.comp2char;
121 const typename alphabet_type::C_type & C = m_alphabet.C;
122 const typename alphabet_type::sigma_type & sigma = m_alphabet.sigma;
123 const psi_type & psi = m_psi;
124 const lf_type lf = lf_type(*this);
125 const bwt_type bwt = bwt_type(*this);
126 const isa_type isa = isa_type(*this);
127 const bwt_type L = bwt_type(*this);
129 const text_type text = text_type(*this);
130 const sa_sample_type & sa_sample = m_sa_sample;
131 const isa_sample_type & isa_sample = m_isa_sample;
132
134 csa_sada() { create_buffer(); }
137
139 csa_sada(const csa_sada & csa)
140 : m_psi(csa.m_psi)
141 , m_sa_sample(csa.m_sa_sample)
142 , m_isa_sample(csa.m_isa_sample)
143 , m_alphabet(csa.m_alphabet)
144 {
145 create_buffer();
146 m_isa_sample.set_vector(&m_sa_sample);
147 }
148
151 : m_psi(std::move(csa.m_psi))
152 , m_sa_sample(std::move(csa.m_sa_sample))
153 , m_isa_sample(std::move(csa.m_isa_sample))
154 , m_alphabet(std::move(csa.m_alphabet))
155 {
156 create_buffer();
157 m_isa_sample.set_vector(&m_sa_sample);
158 }
159
160 csa_sada(cache_config & config);
161
163
168 size_type size() const { return m_psi.size(); }
169
171
174 static size_type max_size() { return t_enc_vec::max_size(); }
175
177
180 bool empty() const { return m_psi.empty(); }
181
183
186 const_iterator begin() const { return const_iterator(this, 0); }
187
189
192 const_iterator end() const { return const_iterator(this, size()); }
193
195
201 inline value_type operator[](size_type i) const;
202
204
208 {
209 if (this != &csa)
210 {
211 csa_sada tmp(csa);
212 *this = std::move(tmp);
213 }
214 return *this;
215 }
216
218
222 {
223 if (this != &csa)
224 {
225 m_psi = std::move(csa.m_psi);
226 m_sa_sample = std::move(csa.m_sa_sample);
227 m_isa_sample = std::move(csa.m_isa_sample);
228 m_isa_sample.set_vector(&m_sa_sample);
229 m_alphabet = std::move(csa.m_alphabet);
230 m_psi_buf = std::move(csa.m_psi_buf);
231 }
232 return *this;
233 }
234
236 bool operator==(csa_sada const & other) const noexcept
237 {
238 return (m_psi == other.m_psi) && (m_sa_sample == other.m_sa_sample) && (m_isa_sample == other.m_isa_sample) &&
239 (m_alphabet == other.m_alphabet);
240 }
241
243 bool operator!=(csa_sada const & other) const noexcept { return !(*this == other); }
244
246
249 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const;
250
252
254 void load(std::istream & in);
255
256 template <typename archive_t>
257 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const;
258
259 template <typename archive_t>
260 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar);
261
262 uint32_t get_sample_dens() const { return t_dens; }
263
264 private:
265 // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text.
266 /*
267 * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$.
268 * \param c The symbol to count the occurrences in the prefix.
269 * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT.
270 * \par Time complexity
271 * \f$ \Order{\log n t_{\Psi}} \f$
272 */
273 size_type rank_bwt(size_type i, const char_type c) const
274 {
276 if (cc == 0 and c != 0) // character is not in the text => return 0
277 return 0;
278 if (i == 0) return 0;
279 assert(i <= size());
280
281 size_type lower_b, upper_b; // lower_b inclusive, upper_b exclusive
282
283 const size_type sd = m_psi.get_sample_dens();
284 size_type lower_sb = (C[cc] + sd - 1) / sd; // lower_sb inclusive
285 size_type upper_sb = (C[cc + 1] + sd - 1) / sd; // upper_sb exclusive
286 while (lower_sb + 1 < upper_sb)
287 {
288 size_type mid = (lower_sb + upper_sb) / 2;
289 if (m_psi.sample(mid) >= i)
290 upper_sb = mid;
291 else
292 lower_sb = mid;
293 }
294
295 if (lower_sb == upper_sb)
296 { // the interval was smaller than sd
297 lower_b = C[cc];
298 upper_b = C[cc + 1];
299 }
300 else if (lower_sb > (C[cc] + sd - 1) / sd)
301 { // main case
302 // TODO: don't use get_inter_sampled_values if t_dens is really
303 // large
304 lower_b = lower_sb * sd;
305 if (0 == m_psi_buf.size())
306 {
307 upper_b = std::min(upper_sb * sd, C[cc + 1]);
308 goto finish;
309 }
310 uint64_t * p = m_psi_buf.data();
311 // extract the psi values between two samples
312 m_psi.get_inter_sampled_values(lower_sb, p);
313 p = m_psi_buf.data();
314 uint64_t smpl = m_psi.sample(lower_sb);
315 // handle border cases
316 if (lower_b + m_psi.get_sample_dens() >= C[cc + 1])
317 m_psi_buf[C[cc + 1] - lower_b] = size() - smpl;
318 else
319 m_psi_buf[m_psi.get_sample_dens()] = size() - smpl;
320 // search the result linear
321 while ((*p++) + smpl < i)
322 ;
323
324 return p - 1 - m_psi_buf.data() + lower_b - C[cc];
325 }
326 else
327 { // lower_b == (m_C[cc]+sd-1)/sd and lower_sb < upper_sb
328 if (m_psi.sample(lower_sb) >= i)
329 {
330 lower_b = C[cc];
331 upper_b = lower_sb * sd + 1;
332 }
333 else
334 {
335 lower_b = lower_sb * sd;
336 upper_b = std::min(upper_sb * sd, C[cc + 1]);
337 }
338 }
339 finish:
340 // binary search the interval [C[cc]..C[cc+1]-1] for the result
341 // size_type lower_b = m_C[cc], upper_b = m_C[cc+1]; // lower_b inclusive, upper_b exclusive
342 while (lower_b + 1 < upper_b)
343 {
344 size_type mid = (lower_b + upper_b) / 2;
345 if (m_psi[mid] >= i)
346 upper_b = mid;
347 else
348 lower_b = mid;
349 }
350 if (lower_b > C[cc])
351 return lower_b - C[cc] + 1;
352 else
353 { // lower_b == m_C[cc]
354 return m_psi[lower_b] < i; // 1 if m_psi[lower_b]<i, 0 otherwise
355 }
356 }
357
358 // Calculates the position of the i-th c in the BWT of the original text.
359 /*
360 * \param i The i-th occurrence. \f$i\in [1..rank_bwt(size(),c)]\f$.
361 * \param c Symbol c.
362 * \returns The position of the i-th c in the BWT or size() if c does occur less then i times.
363 * \par Time complexity
364 * \f$ \Order{t_{\Psi}} \f$
365 */
366 size_type select_bwt(size_type i, const char_type c) const
367 {
368 assert(i > 0);
370 if (cc == 0 and c != 0) // character is not in the text => return 0
371 return size();
372 assert(cc != 255);
373 if (C[cc] + i - 1 < C[cc + 1]) { return m_psi[C[cc] + i - 1]; }
374 else
375 return size();
376 }
377};
378
379// == template functions ==
380
381template <class t_enc_vec,
382 uint32_t t_dens,
383 uint32_t t_inv_dens,
384 class t_sa_sample_strat,
385 class t_isa,
386 class t_alphabet_strat>
388{
389 create_buffer();
390 if (!cache_file_exists(key_bwt<alphabet_type::int_width>(), config)) { return; }
391 size_type n = 0;
392 {
394 cache_file_name(key_bwt<alphabet_type::int_width>(), config));
395 n = bwt_buf.size();
396 auto event = memory_monitor::event("construct csa-alpbabet");
397 m_alphabet = alphabet_type(bwt_buf, n);
398 }
399 {
400 auto event = memory_monitor::event("sample SA");
401 m_sa_sample = sa_sample_type(config);
402 }
403 {
404 auto event = memory_monitor::event("sample ISA");
405 isa_sample_type isa_s(config, &m_sa_sample);
406 util::swap_support(m_isa_sample, isa_s, &m_sa_sample, (const sa_sample_type *)nullptr);
407 }
408 // if ( config.delete_files ) {
409 // remove_from_cache<int_vector<>>(conf::KEY_SA, config);
410 // }
411
412 int_vector<> cnt_chr(sigma, 0, bits::hi(n) + 1);
413 for (typename alphabet_type::sigma_type i = 0; i < sigma; ++i) { cnt_chr[i] = C[i]; }
414 // calculate psi
415 {
416 auto event = memory_monitor::event("construct PSI");
418 cache_file_name(key_bwt<alphabet_type::int_width>(), config));
419 std::string psi_file = cache_file_name(conf::KEY_PSI, config);
420 auto psi = write_out_mapper<>::create(psi_file, n, bits::hi(n) + 1);
421 for (size_type i = 0; i < n; ++i) { psi[cnt_chr[char2comp[bwt_buf[i]]]++] = i; }
423 }
424 {
425 auto event = memory_monitor::event("encode PSI");
427 m_psi = t_enc_vec(psi_buf);
428 }
429}
430
431template <class t_enc_vec,
432 uint32_t t_dens,
433 uint32_t t_inv_dens,
434 class t_sa_sample_strat,
435 class t_isa,
436 class t_alphabet_strat>
438 size_type i) const -> value_type
439{
440 size_type off = 0;
441 while (!m_sa_sample.is_sampled(i))
442 { // while i mod t_dens != 0 (SA[i] is not sampled)
443 i = psi[i]; // go to the position where SA[i]+1 is located
444 ++off; // add 1 to the offset
445 }
446 value_type result = m_sa_sample[i];
447 if (result < off) { return m_psi.size() - (off - result); }
448 else
449 return result - off;
450}
451
452template <class t_enc_vec,
453 uint32_t t_dens,
454 uint32_t t_inv_dens,
455 class t_sa_sample_strat,
456 class t_isa,
457 class t_alphabet_strat>
459 std::ostream & out,
461 std::string name) const -> size_type
462{
463 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
464 size_type written_bytes = 0;
465 written_bytes += m_psi.serialize(out, child, "psi");
466 written_bytes += m_sa_sample.serialize(out, child, "sa_samples");
467 written_bytes += m_isa_sample.serialize(out, child, "isa_samples");
468 written_bytes += m_alphabet.serialize(out, child, "alphabet");
469 structure_tree::add_size(child, written_bytes);
470 return written_bytes;
471}
472
473template <class t_enc_vec,
474 uint32_t t_dens,
475 uint32_t t_inv_dens,
476 class t_sa_sample_strat,
477 class t_isa,
478 class t_alphabet_strat>
480{
481 m_psi.load(in);
482 m_sa_sample.load(in);
483 m_isa_sample.load(in, &m_sa_sample);
484 m_alphabet.load(in);
485}
486
487template <class t_enc_vec,
488 uint32_t t_dens,
489 uint32_t t_inv_dens,
490 class t_sa_sample_strat,
491 class t_isa,
492 class t_alphabet_strat>
493template <typename archive_t>
495 archive_t & ar) const
496{
497 ar(CEREAL_NVP(m_psi));
498 ar(CEREAL_NVP(m_sa_sample));
499 ar(CEREAL_NVP(m_isa_sample));
500 ar(CEREAL_NVP(m_alphabet));
501}
502
503template <class t_enc_vec,
504 uint32_t t_dens,
505 uint32_t t_inv_dens,
506 class t_sa_sample_strat,
507 class t_isa,
508 class t_alphabet_strat>
509template <typename archive_t>
511 archive_t & ar)
512{
513 ar(CEREAL_NVP(m_psi));
514 ar(CEREAL_NVP(m_sa_sample));
515 ar(CEREAL_NVP(m_isa_sample));
516 m_isa_sample.set_vector(&m_sa_sample);
517 ar(CEREAL_NVP(m_alphabet));
518}
519
520} // end namespace sdsl
521#endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:31
A wrapper for the bwt of a compressed suffix array that is based on the function.
A class for the Compressed Suffix Array (CSA) proposed by Sadakane for practical implementation.
Definition: csa_sada.hpp:49
t_isa_sample_strat::template type< csa_sada > isa_sample_type
Definition: csa_sada.hpp:86
alphabet_type::string_type string_type
Definition: csa_sada.hpp:91
uint32_t get_sample_dens() const
Definition: csa_sada.hpp:262
alphabet_type::alphabet_category alphabet_category
Definition: csa_sada.hpp:88
csa_tag index_category
Definition: csa_sada.hpp:94
isa_of_csa_psi< csa_sada > isa_type
Definition: csa_sada.hpp:82
bwt_of_csa_psi< csa_sada > bwt_type
Definition: csa_sada.hpp:81
const alphabet_type::comp2char_type & comp2char
Definition: csa_sada.hpp:120
const first_row_type F
Definition: csa_sada.hpp:128
const bwt_type bwt
Definition: csa_sada.hpp:125
t_enc_vec enc_vector_type
Definition: csa_sada.hpp:78
const_iterator begin() const
Returns a const_iterator to the first element.
Definition: csa_sada.hpp:186
const_reference reference
Definition: csa_sada.hpp:72
const isa_type isa
Definition: csa_sada.hpp:126
csa_sada & operator=(csa_sada &&csa)
Assignment Move Operator.
Definition: csa_sada.hpp:221
csa_sada csa_type
Definition: csa_sada.hpp:92
t_sa_sample_strat::template type< csa_sada > sa_sample_type
Definition: csa_sada.hpp:85
const bwt_type L
Definition: csa_sada.hpp:127
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serialize to a stream.
Definition: csa_sada.hpp:458
bool operator!=(csa_sada const &other) const noexcept
Inequality operator.
Definition: csa_sada.hpp:243
alphabet_type::comp_char_type comp_char_type
Definition: csa_sada.hpp:89
bool empty() const
Returns if the data strucutre is empty.
Definition: csa_sada.hpp:180
static size_type max_size()
Returns the largest size that csa_sada can ever have.
Definition: csa_sada.hpp:174
static const uint32_t linear_decode_limit
Definition: csa_sada.hpp:100
csa_sada(csa_sada &&csa)
Move constructor.
Definition: csa_sada.hpp:150
value_type operator[](size_type i) const
[]-operator
Definition: csa_sada.hpp:437
const_reference * pointer
Definition: csa_sada.hpp:73
const_iterator iterator
Definition: csa_sada.hpp:70
bool operator==(csa_sada const &other) const noexcept
Equality operator.
Definition: csa_sada.hpp:236
uint64_t value_type
Definition: csa_sada.hpp:68
t_alphabet_strat alphabet_type
Definition: csa_sada.hpp:87
void load(std::istream &in)
Load from a stream.
Definition: csa_sada.hpp:479
const psi_type & psi
Definition: csa_sada.hpp:123
const lf_type lf
Definition: csa_sada.hpp:124
const isa_sample_type & isa_sample
Definition: csa_sada.hpp:131
~csa_sada()
Default Destructor.
Definition: csa_sada.hpp:136
text_of_csa< csa_sada > text_type
Definition: csa_sada.hpp:83
csa_sada(const csa_sada &csa)
Copy constructor.
Definition: csa_sada.hpp:139
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Definition: csa_sada.hpp:494
first_row_of_csa< csa_sada > first_row_type
Definition: csa_sada.hpp:84
int_vector ::size_type size_type
Definition: csa_sada.hpp:75
size_type size() const
Number of elements in the .
Definition: csa_sada.hpp:168
alphabet_type::char_type char_type
Definition: csa_sada.hpp:90
ptrdiff_t difference_type
Definition: csa_sada.hpp:77
const pointer const_pointer
Definition: csa_sada.hpp:74
csa_sada & operator=(const csa_sada &csa)
Assignment Copy Operator.
Definition: csa_sada.hpp:207
random_access_const_iterator< csa_sada > const_iterator
Definition: csa_sada.hpp:69
enc_vector_type psi_type
Definition: csa_sada.hpp:79
const text_type text
Definition: csa_sada.hpp:129
const alphabet_type::C_type & C
Definition: csa_sada.hpp:121
const alphabet_type::char2comp_type & char2comp
Definition: csa_sada.hpp:119
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition: csa_sada.hpp:192
traverse_csa_psi< csa_sada, false > lf_type
Definition: csa_sada.hpp:80
const sa_sample_type & sa_sample
Definition: csa_sada.hpp:130
size_type csa_size_type
Definition: csa_sada.hpp:76
const value_type const_reference
Definition: csa_sada.hpp:71
const alphabet_type::sigma_type & sigma
Definition: csa_sada.hpp:122
psi_tag extract_category
Definition: csa_sada.hpp:95
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Definition: csa_sada.hpp:510
csa_sada()
Default Constructor.
Definition: csa_sada.hpp:134
uint64_t size() const
Returns the number of elements currently stored.
A generic vector class for integers of width .
Definition: int_vector.hpp:216
static mm_event_proxy event(const std::string &name)
Generic iterator for a random access container.
Definition: iterators.hpp:24
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
static void add_size(structure_tree_node *v, uint64_t value)
static int_vector_mapper< t_width > create(const std::string &key, cache_config &config)
csa_alphabet_strategy.hpp includes different strategy classes for representing an alphabet of a CSA.
csa_sampling_strategy.hpp includes different strategy classes for suffix array sampling in the CSAs.
enc_vector.hpp contains the sdsl::enc_vector class.
int_vector.hpp contains the sdsl::int_vector class.
iterators.hpp contains an generic iterator for random access containers.
constexpr char KEY_PSI[]
Definition: config.hpp:43
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
void register_cache_file(const std::string &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition: io.hpp:656
bool cache_file_exists(const std::string &key, const cache_config &config)
Checks if the resource specified by the key exists in the cache.
Definition: io.hpp:672
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:651
Helper class for construction process.
Definition: config.hpp:67
suffix_array_helper.hpp contains some helper classes for CSTs
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.