SDSL 3.0.1
Succinct Data Structure Library
wt_rlmn.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
9#ifndef INCLUDED_SDSL_WT_RLMN
10#define INCLUDED_SDSL_WT_RLMN
11
12#include <algorithm> // for std::swap
13#include <iostream>
14#include <queue>
15#include <stdexcept>
16#include <utility> // for pair
17#include <vector>
18
19#include <sdsl/int_vector.hpp>
20#include <sdsl/sd_vector.hpp> // for standard initialisation of template parameters
22#include <sdsl/util.hpp>
23#include <sdsl/wt_huff.hpp>
24
26namespace sdsl
27{
28
29template <class t_alphabet_cat>
31{
32 enum
33 {
34 width = 0
35 };
38
39 static std::map<uint64_t, uint64_t> temp_C() { return std::map<uint64_t, uint64_t>(); }
40
41 static C_type init_C(std::map<uint64_t, uint64_t> & C, uint64_t size)
42 {
43 uint64_t max_symbol = (--C.end())->first;
44 return C_type(max_symbol + 1, 0, bits::hi(size) + 1);
45 }
46
47 static C_bf_rank_type init_C_bf_rank(const C_type & C, uint64_t size)
48 {
49 return C_bf_rank_type(C.size(), 0, bits::hi(size) + 1);
50 }
51};
52
53template <>
55{
56 enum
57 {
58 width = 8
59 };
62
63 static int_vector<64> temp_C() { return int_vector<64>(256, 0); }
64
65 static C_type init_C(C_type & C, uint64_t) { return C; }
66
67 static C_bf_rank_type init_C_bf_rank(const C_type &, uint64_t) { return int_vector<64>(256, 0); }
68};
69
71
90template <class t_bitvector = sd_vector<>,
91 class t_rank = typename t_bitvector::rank_1_type,
92 class t_select = typename t_bitvector::select_1_type,
93 class t_wt = wt_huff<>>
95{
96 public:
97 typedef t_wt wt_type;
99 typedef typename t_wt::value_type value_type;
100 typedef typename t_bitvector::difference_type difference_type;
103 typedef t_bitvector bit_vector_type;
104 typedef t_rank rank_support_type;
105 typedef t_select select_support_type;
107 typedef typename t_wt::alphabet_category alphabet_category;
108 enum
109 {
110 lex_ordered = false
111 }; // TODO: is should be possible
112 enum
113 {
115 };
118 // to support all lex_ordered
119 // operations if t_wt::lex_ordered is
120 // true
121
122 private:
123 size_type m_size = 0; // size of the original input sequence
124 bit_vector_type m_bl; // bit vector for starts of runs in
125 // the BWT (or last column), i.e. _b_ _l_ast
126 bit_vector_type m_bf; // bit vector for starts of runs in
127 // the first column of the sorted suffixes, i.e _b_ _f_irst
128 wt_type m_wt; // wavelet tree for all levels
129 // two equal chars
130 rank_support_type m_bl_rank; // rank support for bit vector bl
131 rank_support_type m_bf_rank; // rank support for bit vector bf
132 select_support_type m_bl_select; // select support for bit vector bl
133 select_support_type m_bf_select; // select support for bit vector bf
134 C_type m_C; //
135 C_bf_rank_type m_C_bf_rank; // stores the number of 1s in m_bf for
136 // the prefixes m_bf[0..m_C[0]],m_bf[0..m_C[1]],....,m_bf[0..m_C[255]];
137 // named C_s in the original paper
138
139 public:
140 const size_type & sigma = m_wt.sigma;
141
142 // Default constructor
143 wt_rlmn() = default;
144
146
151 template <typename t_it>
152 wt_rlmn(t_it begin, t_it end, std::string tmp_dir = ram_file_name(""))
153 : m_size(std::distance(begin, end))
154 {
155 std::string temp_file = tmp_dir + +"_wt_rlmn_" + util::to_string(util::pid()) + "_" +
157 {
158 if (0 == m_size) return;
159 int_vector_buffer<width> condensed_wt(temp_file, std::ios::out);
160 // scope for bl and bf
161 bit_vector bl = bit_vector(m_size, 0);
162
164 value_type last_c = (value_type)0;
165 size_type j = 0;
166 for (auto it = begin; it != end; ++it, ++j)
167 {
168 value_type c = *it;
169 if (last_c != c or it == begin)
170 {
171 bl[j] = 1;
172 condensed_wt.push_back(c);
173 }
174 ++C[c];
175 last_c = c;
176 }
177 condensed_wt.close();
179
180 for (size_type i = 0, prefix_sum = 0; i < m_C.size(); ++i)
181 {
182 m_C[i] = prefix_sum;
183 prefix_sum += C[i];
184 }
185
186 C_type lf_map = m_C;
187 bit_vector bf = bit_vector(m_size + 1, 0);
188 bf[m_size] = 1; // initialize last element
189 j = 0;
190 for (auto it = begin; it != end; ++it, ++j)
191 {
192 value_type c = *it;
193 if (bl[j]) { bf[lf_map[c]] = 1; }
194 ++lf_map[c];
195 }
196 {
197 int_vector_buffer<width> temp_bwt_buf(temp_file);
198 m_wt = wt_type(temp_bwt_buf.begin(), temp_bwt_buf.end(), tmp_dir);
199 }
200 sdsl::remove(temp_file);
201 m_bl = bit_vector_type(std::move(bl));
202 m_bf = bit_vector_type(std::move(bf));
203 }
204
205 util::init_support(m_bl_rank, &m_bl);
206 util::init_support(m_bf_rank, &m_bf);
207 util::init_support(m_bf_select, &m_bf);
208 util::init_support(m_bl_select, &m_bl);
209
210 m_C_bf_rank = wt_rlmn_trait<alphabet_category>::init_C_bf_rank(m_C, m_size);
211 for (size_type i = 0; i < m_C.size(); ++i) { m_C_bf_rank[i] = m_bf_rank(m_C[i]); }
212 }
213
215 wt_rlmn(const wt_rlmn & wt)
216 : m_size(wt.m_size)
217 , m_bl(wt.m_bl)
218 , m_bf(wt.m_bf)
219 , m_wt(wt.m_wt)
220 , m_bl_rank(wt.m_bl_rank)
221 , m_bf_rank(wt.m_bf_rank)
222 , m_bl_select(wt.m_bl_select)
223 , m_bf_select(wt.m_bf_select)
224 , m_C(wt.m_C)
225 , m_C_bf_rank(wt.m_C_bf_rank)
226 {
227 m_bl_rank.set_vector(&m_bl);
228 m_bf_rank.set_vector(&m_bf);
229 m_bl_select.set_vector(&m_bl);
230 m_bf_select.set_vector(&m_bf);
231 }
232
235 : m_size(wt.m_size)
236 , m_bl(std::move(wt.m_bl))
237 , m_bf(std::move(wt.m_bf))
238 , m_wt(std::move(wt.m_wt))
239 , m_bl_rank(std::move(wt.m_bl_rank))
240 , m_bf_rank(std::move(wt.m_bf_rank))
241 , m_bl_select(std::move(wt.m_bl_select))
242 , m_bf_select(std::move(wt.m_bf_select))
243 , m_C(std::move(wt.m_C))
244 , m_C_bf_rank(std::move(wt.m_C_bf_rank))
245 {
246 m_bl_rank.set_vector(&m_bl);
247 m_bf_rank.set_vector(&m_bf);
248 m_bl_select.set_vector(&m_bl);
249 m_bf_select.set_vector(&m_bf);
250 }
251
254 {
255 if (this != &wt)
256 {
257 wt_rlmn tmp(wt);
258 *this = std::move(tmp);
259 }
260 return *this;
261 }
262
265 {
266 if (this != &wt)
267 {
268 m_size = std::move(wt.m_size);
269 m_bl = std::move(wt.m_bl);
270 m_bf = std::move(wt.m_bf);
271 m_wt = std::move(wt.m_wt);
272 m_bl_rank = std::move(wt.m_bl_rank);
273 m_bl_rank.set_vector(&m_bl);
274 m_bf_rank = std::move(wt.m_bf_rank);
275 m_bf_rank.set_vector(&m_bf);
276 m_bl_select = std::move(wt.m_bl_select);
277 m_bl_select.set_vector(&m_bl);
278 m_bf_select = std::move(wt.m_bf_select);
279 m_bf_select.set_vector(&m_bf);
280 m_C = std::move(wt.m_C);
281 m_C_bf_rank = std::move(wt.m_C_bf_rank);
282 }
283 return *this;
284 }
285
287 size_type size() const { return m_size; }
288
290 bool empty() const { return 0 == m_size; }
291
293
300 {
301 assert(i < size());
302 return m_wt[m_bl_rank(i + 1) - 1];
303 };
304
306
315 {
316 assert(i <= size());
317 if (i == 0) return 0;
318 size_type wt_ex_pos = m_bl_rank(i);
319 size_type c_runs = m_wt.rank(wt_ex_pos, c);
320 if (c_runs == 0) return 0;
321 if (m_wt[wt_ex_pos - 1] == c)
322 {
323 size_type c_run_begin = m_bl_select(wt_ex_pos);
324 return m_bf_select(m_C_bf_rank[c] + c_runs) - m_C[c] + i - c_run_begin;
325 }
326 else
327 {
328 return m_bf_select(m_C_bf_rank[c] + c_runs + 1) - m_C[c];
329 }
330 };
331
333
339 std::pair<size_type, value_type> inverse_select(size_type i) const
340 {
341 assert(i < size());
342 if (i == 0) { return std::make_pair(0, m_wt[0]); }
343 size_type wt_ex_pos = m_bl_rank(i + 1);
344 auto rc = m_wt.inverse_select(wt_ex_pos - 1);
345 size_type c_runs = rc.first + 1;
346 value_type c = rc.second;
347 if (c_runs == 0) return std::make_pair(0, c);
348 if (m_wt[wt_ex_pos - 1] == c)
349 {
350 size_type c_run_begin = m_bl_select(wt_ex_pos);
351 return std::make_pair(m_bf_select(m_C_bf_rank[c] + c_runs) - m_C[c] + i - c_run_begin, c);
352 }
353 else
354 {
355 return std::make_pair(m_bf_select(m_C_bf_rank[c] + c_runs + 1) - m_C[c], c);
356 }
357 }
358
360
368 {
369 assert(i > 0);
370 assert(i <= rank(size(), c));
371 size_type c_runs = m_bf_rank(m_C[c] + i) - m_C_bf_rank[c];
372 size_type offset = m_C[c] + i - 1 - m_bf_select(c_runs + m_C_bf_rank[c]);
373 return m_bl_select(m_wt.select(c_runs, c) + 1) + offset;
374 };
375
377 const_iterator begin() const { return const_iterator(this, 0); }
378
380 const_iterator end() const { return const_iterator(this, size()); }
381
383 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
384 {
385 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
386 size_type written_bytes = 0;
387 written_bytes += write_member(m_size, out, child, "size");
388 written_bytes += m_bl.serialize(out, child, "bl");
389 written_bytes += m_bf.serialize(out, child, "bf");
390 written_bytes += m_wt.serialize(out, child, "wt");
391 written_bytes += m_bl_rank.serialize(out, child, "bl_rank");
392 written_bytes += m_bf_rank.serialize(out, child, "bf_rank");
393 written_bytes += m_bl_select.serialize(out, child, "bl_select");
394 written_bytes += m_bf_select.serialize(out, child, "bf_select");
395 written_bytes += m_C.serialize(out, child, "C");
396 written_bytes += m_C_bf_rank.serialize(out, child, "C_bf_rank");
397 structure_tree::add_size(child, written_bytes);
398 return written_bytes;
399 }
400
402 void load(std::istream & in)
403 {
404 read_member(m_size, in);
405 m_bl.load(in);
406 m_bf.load(in);
407 m_wt.load(in);
408 m_bl_rank.load(in, &m_bl);
409 m_bf_rank.load(in, &m_bf);
410 m_bl_select.load(in, &m_bl);
411 m_bf_select.load(in, &m_bf);
412 m_C.load(in);
413 m_C_bf_rank.load(in);
414 }
415
417 template <typename archive_t>
418 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
419 {
420 ar(CEREAL_NVP(m_size));
421 ar(CEREAL_NVP(m_bl));
422 ar(CEREAL_NVP(m_bf));
423 ar(CEREAL_NVP(m_wt));
424 ar(CEREAL_NVP(m_bl_rank));
425 ar(CEREAL_NVP(m_bf_rank));
426 ar(CEREAL_NVP(m_bl_select));
427 ar(CEREAL_NVP(m_bf_select));
428 ar(CEREAL_NVP(m_C));
429 ar(CEREAL_NVP(m_C_bf_rank));
430 }
431
433 template <typename archive_t>
434 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
435 {
436 ar(CEREAL_NVP(m_size));
437 ar(CEREAL_NVP(m_bl));
438 ar(CEREAL_NVP(m_bf));
439 ar(CEREAL_NVP(m_wt));
440 ar(CEREAL_NVP(m_bl_rank));
441 m_bl_rank.set_vector(&m_bl);
442 ar(CEREAL_NVP(m_bf_rank));
443 m_bf_rank.set_vector(&m_bf);
444 ar(CEREAL_NVP(m_bl_select));
445 m_bl_select.set_vector(&m_bl);
446 ar(CEREAL_NVP(m_bf_select));
447 m_bf_select.set_vector(&m_bf);
448 ar(CEREAL_NVP(m_C));
449 ar(CEREAL_NVP(m_C_bf_rank));
450 }
451
453 bool operator==(wt_rlmn const & other) const noexcept
454 {
455 return (m_size == other.m_size) && (m_bl == other.m_bl) && (m_bf == other.m_bf) && (m_wt == other.m_wt) &&
456 (m_bl_rank == other.m_bl_rank) && (m_bf_rank == other.m_bf_rank) && (m_bl_select == other.m_bl_select) &&
457 (m_bf_select == other.m_bf_select) && (m_C == other.m_C) && (m_C_bf_rank == other.m_C_bf_rank);
458 }
459
461 bool operator!=(wt_rlmn const & other) const noexcept { return !(*this == other); }
462};
463
464} // end namespace sdsl
465#endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:31
void close(bool remove_file=false)
Close the int_vector_buffer.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
size_type size() const noexcept
The number of elements in the int_vector.
Generic iterator for a random access container.
Definition: iterators.hpp:24
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
static void add_size(structure_tree_node *v, uint64_t value)
A Wavelet Tree class for byte sequences.
Definition: wt_rlmn.hpp:95
t_bitvector bit_vector_type
Definition: wt_rlmn.hpp:103
wt_rlmn_trait< alphabet_category >::C_bf_rank_type C_bf_rank_type
Definition: wt_rlmn.hpp:117
t_wt::alphabet_category alphabet_category
Definition: wt_rlmn.hpp:107
wt_rlmn()=default
random_access_const_iterator< wt_rlmn > const_iterator
Definition: wt_rlmn.hpp:101
wt_rlmn_trait< alphabet_category >::C_type C_type
Definition: wt_rlmn.hpp:116
bool operator==(wt_rlmn const &other) const noexcept
Equality operator.
Definition: wt_rlmn.hpp:453
size_type rank(size_type i, value_type c) const
Calculates how many symbols c are in the prefix [0..i-1].
Definition: wt_rlmn.hpp:314
t_select select_support_type
Definition: wt_rlmn.hpp:105
size_type size() const
Returns the size of the original vector.
Definition: wt_rlmn.hpp:287
t_wt::value_type value_type
Definition: wt_rlmn.hpp:99
const size_type & sigma
Definition: wt_rlmn.hpp:140
void load(std::istream &in)
Loads the data structure from the given istream.
Definition: wt_rlmn.hpp:402
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Load via cereal.
Definition: wt_rlmn.hpp:434
bool empty() const
Returns whether the wavelet tree contains no data.
Definition: wt_rlmn.hpp:290
wt_rlmn(wt_rlmn &&wt)
Move constructor.
Definition: wt_rlmn.hpp:234
value_type operator[](size_type i) const
Recovers the i-th symbol of the original vector.
Definition: wt_rlmn.hpp:299
wt_rlmn & operator=(const wt_rlmn &wt)
Assignment operator.
Definition: wt_rlmn.hpp:253
wt_rlmn(const wt_rlmn &wt)
Copy constructor.
Definition: wt_rlmn.hpp:215
bool operator!=(wt_rlmn const &other) const noexcept
Inequality operator.
Definition: wt_rlmn.hpp:461
t_wt wt_type
Definition: wt_rlmn.hpp:97
wt_tag index_category
Definition: wt_rlmn.hpp:106
wt_rlmn(t_it begin, t_it end, std::string tmp_dir=ram_file_name(""))
Construct the wavelet tree from a sequence defined by two interators.
Definition: wt_rlmn.hpp:152
int_vector ::size_type size_type
Definition: wt_rlmn.hpp:98
t_rank rank_support_type
Definition: wt_rlmn.hpp:104
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition: wt_rlmn.hpp:383
const_iterator begin() const
Returns a const_iterator to the first element.
Definition: wt_rlmn.hpp:377
std::pair< size_type, value_type > inverse_select(size_type i) const
Calculates how many times symbol wt[i] occurs in the prefix [0..i-1].
Definition: wt_rlmn.hpp:339
t_bitvector::difference_type difference_type
Definition: wt_rlmn.hpp:100
wt_rlmn & operator=(wt_rlmn &&wt)
Assignment move operator.
Definition: wt_rlmn.hpp:264
const_iterator iterator
Definition: wt_rlmn.hpp:102
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal.
Definition: wt_rlmn.hpp:418
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition: wt_rlmn.hpp:380
size_type select(size_type i, value_type c) const
Calculates the ith occurrence of the symbol c in the supported vector.
Definition: wt_rlmn.hpp:367
int_vector.hpp contains the sdsl::int_vector class.
uint64_t id()
uint64_t pid()
std::string to_string(const T &t, int w=1)
Namespace for the succinct data structure library.
std::string ram_file_name(const std::string &file)
Returns the corresponding RAM-file name for file.
Definition: ram_fs.hpp:174
size_t write_member(const T &t, std::ostream &out, sdsl::structure_tree_node *v=nullptr, std::string name="")
Definition: io.hpp:84
void read_member(T &t, std::istream &in)
Definition: io.hpp:111
int_vector< 1 > bit_vector
bit_vector is a specialization of the int_vector.
Definition: int_vector.hpp:53
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
int_vector ::size_type size(const range_type &r)
Size of a range.
Definition: wt_helper.hpp:787
sd_vector.hpp contains the sdsl::sd_vector class, and classes which support rank and select for sd_ve...
Contains declarations and definitions of data structure concepts.
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:651
static C_bf_rank_type init_C_bf_rank(const C_type &, uint64_t)
Definition: wt_rlmn.hpp:67
static C_type init_C(C_type &C, uint64_t)
Definition: wt_rlmn.hpp:65
static int_vector< 64 > temp_C()
Definition: wt_rlmn.hpp:63
static C_type init_C(std::map< uint64_t, uint64_t > &C, uint64_t size)
Definition: wt_rlmn.hpp:41
static C_bf_rank_type init_C_bf_rank(const C_type &C, uint64_t size)
Definition: wt_rlmn.hpp:47
int_vector C_bf_rank_type
Definition: wt_rlmn.hpp:37
static std::map< uint64_t, uint64_t > temp_C()
Definition: wt_rlmn.hpp:39
int_vector C_type
Definition: wt_rlmn.hpp:36
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.