SDSL 3.0.1
Succinct Data Structure Library
wt_epr.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
9#ifndef INCLUDED_SDSL_WT_EPR
10#define INCLUDED_SDSL_WT_EPR
11
12#include <tuple>
13#include <type_traits>
14#include <utility>
15#include <vector>
16
17#include <sdsl/int_vector.hpp>
19#include <sdsl/wt_helper.hpp>
20
22namespace sdsl
23{
24
31template <uint8_t alphabet_size, class rank_type = rank_support_int_v<alphabet_size>, class t_tree_strat = byte_tree<>>
32class wt_epr
33{
34 public:
35 typedef typename t_tree_strat::template type<wt_epr> tree_strat_type;
42 typedef byte_alphabet_tag /*typename tree_strat_type::alphabet_category*/ alphabet_category;
43 enum
44 {
45 lex_ordered = true
46 };
47
48 private:
50 static constexpr bool has_inblock_text = std::is_same<rank_type, rank_support_int_v<alphabet_size>>::value;
51
52 size_type m_size = 0;
53 size_type m_sigma = 0;
54 int_vector<> m_bv;
55 rank_type m_bv_rank;
56
58 template <bool has_inblock_text_>
59 auto construct_init_rank_select(int_vector<> intermediate_bitvector) -> std::enable_if_t<has_inblock_text_, void>
60 {
61 // The text is stored inside of the rank structure so we do not store it here.
62 m_bv_rank = rank_type{ &intermediate_bitvector }; // Create the rank support structure.
63 }
64
66 template <bool has_inblock_text_>
67 auto construct_init_rank_select(int_vector<> intermediate_bitvector) -> std::enable_if_t<!has_inblock_text_, void>
68 {
69 m_bv = std::move(intermediate_bitvector);
70 m_bv_rank = rank_type{ &m_bv }; // Create the rank support structure.
71 }
72
74 template <bool has_inblock_text_>
75 auto value_at(size_type const position) const -> std::enable_if_t<has_inblock_text_, value_type>
76 { // In the special epr rank implementation, the text is stored in the superblocks.
77 assert(position < size());
78 return m_bv_rank.value_at(position); // Extract the value from the rank support structure.
79 }
80
82 template <bool has_inblock_text_>
83 auto value_at(size_type const position) const -> std::enable_if_t<!has_inblock_text_, value_type>
84 {
85 assert(position < size());
86 return m_bv[position];
87 }
88
89 public:
90 const size_type & sigma = m_sigma;
91 const int_vector<> & bv = m_bv;
92
94 wt_epr() = default;
95
102 template <typename t_it>
103 wt_epr(t_it begin, t_it end)
104 : m_size(std::distance(begin, end))
105 {
106 if (0 == m_size) return;
107 // O(n + |\Sigma|\log|\Sigma|) algorithm for calculating node sizes
108 // TODO: C should also depend on the tree_strategy. C is just a mapping
109 // from a symbol to its frequency. So a map<uint64_t,uint64_t> could be
110 // used for integer alphabets...
111 std::vector<size_type> C;
112 // 1. Count occurrences of characters
114 // 2. Calculate effective alphabet size
116
117 // The text cannot have an alphabet larger than the required alphabet_size.
118 if (m_sigma > alphabet_size)
119 throw std::domain_error{ "The given text uses an alphabet that is larger than the explicitly given "
120 "alphabet size." };
121
122 // 4. Generate wavelet tree bit sequence m_bv
123 int_vector<> intermediate_bitvector{};
124 intermediate_bitvector.width(std::ceil(std::log2(m_sigma)));
125 intermediate_bitvector.resize(m_size);
126
127 std::copy(begin, end, intermediate_bitvector.begin());
128
129 // 5. Initialize rank and select data structures for m_bv
130 construct_init_rank_select<has_inblock_text>(std::move(intermediate_bitvector));
131 }
132
133 template <typename t_it>
134 wt_epr(t_it begin, t_it end, std::string)
135 : wt_epr(begin, end)
136 {}
137
139 wt_epr(const wt_epr & wt)
140 : m_size(wt.m_size)
141 , m_sigma(wt.m_sigma)
142 , m_bv(wt.m_bv)
143 , m_bv_rank(wt.m_bv_rank)
144 {
145 m_bv_rank.set_vector(&m_bv);
146 }
147
149 : m_size(wt.m_size)
150 , m_sigma(wt.m_sigma)
151 , m_bv(std::move(wt.m_bv))
152 , m_bv_rank(std::move(wt.m_bv_rank))
153 {
154 m_bv_rank.set_vector(&m_bv);
155 }
156
158 wt_epr & operator=(const wt_epr & wt)
159 {
160 if (this != &wt)
161 {
162 wt_epr tmp(wt); // re-use copy-constructor
163 *this = std::move(tmp); // re-use move-assignment
164 }
165 return *this;
166 }
167
170 {
171 if (this != &wt)
172 {
173 m_size = wt.m_size;
174 m_sigma = wt.m_sigma;
175 m_bv = std::move(wt.m_bv);
176 m_bv_rank = std::move(wt.m_bv_rank);
177 m_bv_rank.set_vector(&m_bv);
178 }
179 return *this;
180 }
181
183 size_type size() const { return m_size; }
184
186 bool empty() const { return m_size == 0; }
187
197 auto operator[](size_type const i) const
198 {
199 assert(i < size());
200 return value_at<has_inblock_text>(i);
201 };
202
214 {
215 assert(i <= size());
216 return m_bv_rank.rank(i, c);
217 };
218
228 std::pair<size_type, value_type> inverse_select(size_type i) const
229 {
230 assert(i < size());
231 value_type value = (*this)[i];
232 return std::make_pair(m_bv_rank.rank(i, value), value);
233 }
234
235 // TODO: implement (if necessary?)
255 // void interval_symbols(size_type i,
256 // size_type j,
257 // size_type& k,
258 // std::vector<value_type>& cs,
259 // std::vector<size_type>& rank_c_i,
260 // std::vector<size_type>& rank_c_j) const
261 // { }
262
275 template <class t_ret_type = std::tuple<size_type, size_type, size_type>>
276 t_ret_type lex_count(size_type i, size_type j, value_type c) const
277 {
278 assert(i <= j and j <= size());
279 // size_type smaller = 0;
280 // size_type greater = (j - i) - (m_bv_rank.prefix_rank(j, c) - m_bv_rank.prefix_rank(i, c));
281 // if (c > 0)
282 // smaller = m_bv_rank.prefix_rank(j, c-1) - m_bv_rank.prefix_rank(i, c-1);
283 // size_type rank = m_bv_rank.rank(i, c);
284
285 // TODO: write a function returning a pair for (i, c) and (i, c-1) and benchmark!
286 size_type smaller = 0;
287 size_type prefix_i_c = m_bv_rank.prefix_rank(i, c);
288 size_type prefix_i_c_1 = 0;
289 size_type greater = j - i - m_bv_rank.prefix_rank(j, c) + prefix_i_c;
290 if (c > 0)
291 {
292 prefix_i_c_1 = m_bv_rank.prefix_rank(i, c - 1);
293 smaller = m_bv_rank.prefix_rank(j, c - 1) - prefix_i_c_1;
294 }
295 size_type rank = prefix_i_c - prefix_i_c_1;
296
297 return t_ret_type{ rank, smaller, greater };
298 }
299
300 /*\brief How many symbols are lexicographic smaller than c in [0..i-1].
301 * \param i Exclusive right bound of the range.
302 * \param c Symbol c.
303 * \return A tuple containing:
304 * * rank(i,c)
305 * * #symbols smaller than c in [0..i-1]
306 * \par Precondition
307 * \f$ i \leq size() \f$
308 * \note
309 * This method is only available if lex_ordered = true
310 */
311 template <class t_ret_type = std::tuple<size_type, size_type>>
313 {
314 assert(i <= size());
315 // TODO: write a function returning a pair for (i, c) and (i, c-1) and benchmark!
316 size_type prefix_count_smaller = 0;
317 if (c > 0) prefix_count_smaller = m_bv_rank.prefix_rank(i, c - 1);
318 return t_ret_type{ m_bv_rank.prefix_rank(i, c) - prefix_count_smaller, prefix_count_smaller };
319 }
320
322 const_iterator begin() const { return const_iterator(this, 0); }
323
325 const_iterator end() const { return const_iterator(this, size()); }
326
328 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
329 {
330 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
331 size_type written_bytes = 0;
332 written_bytes += write_member(m_size, out, child, "size");
333 written_bytes += write_member(m_sigma, out, child, "sigma");
334 written_bytes += m_bv.serialize(out, child, "bv");
335 written_bytes += m_bv_rank.serialize(out, child, "bv_rank");
336 structure_tree::add_size(child, written_bytes);
337 return written_bytes;
338 }
339
341 void load(std::istream & in)
342 {
343 read_member(m_size, in);
344 read_member(m_sigma, in);
345 m_bv.load(in);
346 m_bv_rank.load(in, &m_bv);
347 }
348
350 friend bool operator==(wt_epr const & lhs, wt_epr const & rhs) noexcept
351 {
352 return (lhs.m_size == rhs.m_size) && (lhs.m_sigma == rhs.m_sigma) && (lhs.m_bv == rhs.m_bv) &&
353 (lhs.m_bv_rank == rhs.m_bv_rank);
354 }
355
357 friend bool operator!=(wt_epr const & lhs, wt_epr const & rhs) noexcept { return !(lhs == rhs); }
358
359 template <typename archive_t>
360 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
361 {
362 ar(CEREAL_NVP(m_size));
363 ar(CEREAL_NVP(m_sigma));
364 ar(CEREAL_NVP(m_bv));
365 ar(CEREAL_NVP(m_bv_rank));
366 }
367
368 template <typename archive_t>
369 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
370 {
371 ar(CEREAL_NVP(m_size));
372 ar(CEREAL_NVP(m_sigma));
373 ar(CEREAL_NVP(m_bv));
374 ar(CEREAL_NVP(m_bv_rank));
375 m_bv_rank.set_vector(&m_bv);
376 }
377};
378} // namespace sdsl
379
380#endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:31
A generic vector class for integers of width .
Definition: int_vector.hpp:216
void load(std::istream &in)
Load the int_vector for a stream.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:595
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the int_vector to a stream.
Generic iterator for a random access container.
Definition: iterators.hpp:24
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
static void add_size(structure_tree_node *v, uint64_t value)
An EPR-dictionary based wavelet.
Definition: wt_epr.hpp:33
int_vector ::difference_type difference_type
Definition: wt_epr.hpp:40
const_iterator begin() const
Returns a const_iterator to the first element.
Definition: wt_epr.hpp:322
bool empty() const
Returns whether the wavelet tree contains no data.
Definition: wt_epr.hpp:186
wt_epr(const wt_epr &wt)
Copy constructor.
Definition: wt_epr.hpp:139
auto operator[](size_type const i) const
Recovers the i-th symbol of the original vector.
Definition: wt_epr.hpp:197
wt_epr()=default
Default constructor.
int_vector ::size_type size_type
Definition: wt_epr.hpp:36
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition: wt_epr.hpp:328
wt_tag index_category
Definition: wt_epr.hpp:41
const_iterator iterator
Definition: wt_epr.hpp:39
t_ret_type lex_smaller_count(size_type i, value_type c) const
Definition: wt_epr.hpp:312
size_type rank(size_type i, value_type c) const
Calculates how many symbols c are in the prefix [0..i-1].
Definition: wt_epr.hpp:213
friend bool operator!=(wt_epr const &lhs, wt_epr const &rhs) noexcept
Inequality operator.
Definition: wt_epr.hpp:357
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition: wt_epr.hpp:325
size_type size() const
Returns the size of the original vector.
Definition: wt_epr.hpp:183
std::pair< size_type, value_type > inverse_select(size_type i) const
Calculates how many times symbol wt[i] occurs in the prefix [0..i-1].
Definition: wt_epr.hpp:228
t_ret_type lex_count(size_type i, size_type j, value_type c) const
For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
Definition: wt_epr.hpp:276
wt_epr(t_it begin, t_it end)
Construct the EPR-dictionary from a sequence defined by two interators.
Definition: wt_epr.hpp:103
wt_epr & operator=(const wt_epr &wt)
Assignment operator.
Definition: wt_epr.hpp:158
wt_epr(wt_epr &&wt)
Definition: wt_epr.hpp:148
int_vector ::value_type value_type
Definition: wt_epr.hpp:37
random_access_const_iterator< wt_epr > const_iterator
Definition: wt_epr.hpp:38
wt_epr(t_it begin, t_it end, std::string)
Definition: wt_epr.hpp:134
byte_alphabet_tag alphabet_category
Definition: wt_epr.hpp:42
void load(std::istream &in)
Loads the data structure from the given istream.
Definition: wt_epr.hpp:341
const int_vector & bv
Definition: wt_epr.hpp:91
wt_epr & operator=(wt_epr &&wt)
Move assignment operator.
Definition: wt_epr.hpp:169
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Definition: wt_epr.hpp:369
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Definition: wt_epr.hpp:360
const size_type & sigma
Definition: wt_epr.hpp:90
t_tree_strat::template type< wt_epr > tree_strat_type
Definition: wt_epr.hpp:35
friend bool operator==(wt_epr const &lhs, wt_epr const &rhs) noexcept
Equality operator.
Definition: wt_epr.hpp:350
int_vector.hpp contains the sdsl::int_vector class.
Namespace for the succinct data structure library.
void calculate_effective_alphabet_size(const t_rac &C, sigma_type &sigma)
Definition: wt_helper.hpp:52
void calculate_character_occurences(t_it begin, t_it end, t_rac &C)
Count for each character the number of occurrences in rac[0..size-1].
Definition: wt_helper.hpp:40
size_t write_member(const T &t, std::ostream &out, sdsl::structure_tree_node *v=nullptr, std::string name="")
Definition: io.hpp:84
void read_member(T &t, std::istream &in)
Definition: io.hpp:111
rank_support_int.hpp contains classes that support a sdsl::int_vector with constant time rank informa...