SDSL 3.0.1
Succinct Data Structure Library
wt_ap.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
9#ifndef INCLUDED_SDSL_WT_AP
10#define INCLUDED_SDSL_WT_AP
11
12#include <sdsl/bit_vectors.hpp>
13#include <sdsl/int_vector.hpp>
14#include <sdsl/vectors.hpp>
15#include <sdsl/wm_int.hpp>
16#include <sdsl/wt_huff.hpp>
17
19namespace sdsl
20{
21
23
36template <class t_wt_byte = wt_huff<bit_vector, rank_support_v5<>>, class t_wt_int = wm_int<>>
37class wt_ap
38{
39 static_assert(std::is_same<typename index_tag<t_wt_byte>::type, wt_tag>::value,
40 "First template argument has to be a wavelet tree.");
41 static_assert(std::is_same<typename index_tag<t_wt_int>::type, wt_tag>::value,
42 "Second template argument has to be a wavelet tree.");
43
44 public:
50 typedef t_wt_byte wt_byte_type;
51 typedef t_wt_int wt_int_type;
54 enum
55 {
56 lex_ordered = 0
57 };
58
59 protected:
61 value_type m_sigma = 0; //<- \f$ |\Sigma| \f$
66 std::vector<wt_int_type> m_offset;
67
68 private:
69 // retrieves a character's class and offset - if the character exists in the text
70 inline std::tuple<bool, value_type, value_type> try_get_char_class_offset(value_type c) const
71 {
72 if (c >= m_char2class.size())
73 { // c is greater than any symbol in text
74 return std::make_tuple(false, 0, 0);
75 }
76 auto offset_class = m_char2class.inverse_select(c);
77 if (offset_class.second == m_class_cnt)
78 { // c never occurs in text
79 return std::make_tuple(false, 0, 0);
80 }
81 return std::make_tuple(true, offset_class.second, offset_class.first);
82 }
83
84 public:
86
88 wt_ap() {}
89
91
96 template <typename t_it>
97 wt_ap(t_it begin, t_it end, std::string tmp_dir = ram_file_name(""))
98 : m_size(std::distance(begin, end))
99 {
100 const uint8_t wt_byte_width = wt_byte_type::alphabet_category::WIDTH;
101 const uint8_t wt_int_width = wt_int_type::alphabet_category::WIDTH;
102
103 // calculate effective sigma and character frequencies
104 value_type max_symbol = 0;
105 std::vector<std::pair<size_type, value_type>> char_freq;
106 value_type pseudo_entries = 0;
107 {
108 auto event = memory_monitor::event("char freq");
109 for (auto it = begin; it != end; ++it)
110 {
111 value_type element = *it;
112 while (element >= max_symbol)
113 {
114 char_freq.emplace_back(0, max_symbol);
115 max_symbol++;
116 pseudo_entries++;
117 }
118 if (char_freq[element].first == 0) { pseudo_entries--; }
119 char_freq[element].first++;
120 }
121 std::sort(char_freq.rbegin(), char_freq.rend());
122 m_sigma = max_symbol - pseudo_entries;
123 }
124
125 m_singleton_class_cnt = std::min(max_symbol, (value_type)bits::hi(m_sigma));
127
128 std::vector<std::pair<std::string, int_vector_buffer<wt_int_width>>> temp_file_offset_buffers;
129
130 // assign character classes
131 int_vector<wt_byte_width> m_char2class_buffer(max_symbol, m_class_cnt, bits::hi(m_class_cnt + 1) + 1);
132 for (value_type i = 0; i < m_singleton_class_cnt; ++i) { m_char2class_buffer[char_freq[i].second] = i; }
133 value_type current_symbol = m_singleton_class_cnt;
134 value_type class_size = 1;
135 {
136 auto event = memory_monitor::event("char2class");
138 {
139 class_size <<= 1;
140 value_type offset = 0;
141 for (; offset < class_size && current_symbol < m_sigma; ++offset, ++current_symbol)
142 {
143 m_char2class_buffer[char_freq[current_symbol].second] = i;
144 }
145
146 std::string temp_file_offset = tmp_dir + "_wt_ap_offset_" + util::to_string(i - m_singleton_class_cnt) +
148 temp_file_offset_buffers.emplace_back(temp_file_offset,
149 int_vector_buffer<wt_int_width>(temp_file_offset,
150 std::ios::out,
151 1024 * 1024,
152 bits::hi(offset) + 1));
153 }
154 char_freq.clear();
155 construct_im(m_char2class, m_char2class_buffer);
156 }
157
158 // calculate text-order classes and offsets
159 std::string temp_file_class = tmp_dir + "_wt_ap_class_" + util::to_string(util::pid()) + "_" +
161 int_vector_buffer<wt_byte_width> class_buffer(temp_file_class,
162 std::ios::out,
163 1024 * 1024,
164 bits::hi(m_class_cnt) + 1);
165 {
166 auto event = memory_monitor::event("write class and offset");
167 for (auto it = begin; it != end; ++it)
168 {
169 value_type ch = *it;
170 value_type cl = m_char2class_buffer[ch];
171 class_buffer.push_back(cl);
172 if (cl >= m_singleton_class_cnt)
173 {
174 value_type offset = m_char2class.rank(ch, cl);
176 temp_file_offset_buffers[cl].second.push_back(offset);
177 }
178 }
179 class_buffer.close();
180 }
181
182 {
183 auto event = memory_monitor::event("class WT");
184 int_vector_buffer<wt_byte_width> class_buffer(temp_file_class);
185 m_class = wt_byte_type(class_buffer.begin(), class_buffer.end(), tmp_dir);
186 }
187 sdsl::remove(temp_file_class);
188 {
189 auto event = memory_monitor::event("offset WTs");
191 for (value_type i = 0; i < m_class_cnt - m_singleton_class_cnt; ++i)
192 {
193 auto & temp_file_offset_buffer = temp_file_offset_buffers[i];
194 temp_file_offset_buffer.second.close();
195 {
196 int_vector_buffer<wt_int_width> offset_buffer(temp_file_offset_buffer.first);
197 m_offset[i] = wt_int_type(offset_buffer.begin(), offset_buffer.end(), tmp_dir);
198 }
199 sdsl::remove(temp_file_offset_buffer.first);
200 }
201 }
202 }
203
205 wt_ap(const wt_ap & wt)
206 : m_size(wt.m_size)
207 , m_sigma(wt.m_sigma)
211 , m_class(wt.m_class)
212 , m_offset(wt.m_offset)
213 {}
214
216 wt_ap(wt_ap && wt) { *this = std::move(wt); }
217
219 wt_ap & operator=(const wt_ap & wt)
220 {
221 if (this != &wt)
222 {
223 wt_ap tmp(wt);
224 *this = std::move(tmp);
225 }
226 return *this;
227 }
228
231 {
232 if (this != &wt)
233 {
234 m_size = wt.m_size;
235 m_sigma = wt.m_sigma;
236 m_singleton_class_cnt = wt.m_singleton_class_cnt;
237 m_class_cnt = wt.m_class_cnt;
238 m_char2class = std::move(wt.m_char2class);
239 m_class = std::move(wt.m_class);
240 m_offset = std::move(wt.m_offset);
241 }
242 return *this;
243 }
244
246 size_type size() const { return m_size; }
247
249 bool empty() const { return m_size == 0; }
250
252
262 {
263 assert(i < size());
264 auto textoffset_class = m_class.inverse_select(i);
265 auto cl = textoffset_class.second;
266 value_type offset = cl < m_singleton_class_cnt ? 0
267 : m_offset[cl - m_singleton_class_cnt][textoffset_class.first];
268 return m_char2class.select(offset + 1, cl);
269 };
270
272
284 {
285 assert(i <= size());
286 auto success_class_offset = try_get_char_class_offset(c);
287 if (!std::get<0>(success_class_offset)) { return 0; }
288 auto cl = std::get<1>(success_class_offset);
289 auto offset = std::get<2>(success_class_offset);
290 size_type count = m_class.rank(i, cl);
291 return cl < m_singleton_class_cnt ? count : m_offset[cl - m_singleton_class_cnt].rank(count, offset);
292 };
293
295
301 std::pair<size_type, value_type> inverse_select(size_type i) const
302 {
303 assert(i < size());
304
305 auto textoffset_class = m_class.inverse_select(i);
306 auto textoffset = textoffset_class.first;
307 auto cl = textoffset_class.second;
308 if (cl < m_singleton_class_cnt) { return std::make_pair(textoffset, m_char2class.select(1, cl)); }
309 auto class_result = m_offset[cl - m_singleton_class_cnt].inverse_select(textoffset);
310 return std::make_pair(class_result.first, m_char2class.select(class_result.second + 1, cl));
311 }
312
314
325 {
326 assert(1 <= i and i <= rank(size(), c));
327 auto success_class_offset = try_get_char_class_offset(c);
328 if (!std::get<0>(success_class_offset)) { return m_size; }
329 auto cl = std::get<1>(success_class_offset);
330 auto offset = std::get<2>(success_class_offset);
331 size_type text_offset = cl < m_singleton_class_cnt ? i
332 : 1 + m_offset[cl - m_singleton_class_cnt].select(i, offset);
333 return m_class.select(text_offset, cl);
334 };
335
337 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
338 {
339 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
340 size_type written_bytes = 0;
341 written_bytes += write_member(m_size, out, child, "size");
342 written_bytes += write_member(m_sigma, out, child, "sigma");
343 written_bytes += write_member(m_singleton_class_cnt, out, child, "singleton_classes");
344 written_bytes += write_member(m_class_cnt, out, child, "classes");
345 written_bytes += m_char2class.serialize(out, child, "char2class");
346 written_bytes += m_class.serialize(out, child, "class");
347 for (value_type i = 0; i < m_offset.size(); ++i)
348 {
349 written_bytes += m_offset[i].serialize(out, child, "offset");
350 }
351 structure_tree::add_size(child, written_bytes);
352 return written_bytes;
353 }
354
356 void load(std::istream & in)
357 {
358 read_member(m_size, in);
359 read_member(m_sigma, in);
362 m_char2class.load(in);
363 m_class.load(in);
365 m_offset.resize(offset_size);
366 for (value_type i = 0; i < offset_size; ++i) { m_offset[i].load(in); }
367 }
368
370 template <typename archive_t>
371 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
372 {
373 ar(CEREAL_NVP(m_size));
374 ar(CEREAL_NVP(m_sigma));
378 ar(CEREAL_NVP(m_class));
379 ar(CEREAL_NVP(m_offset));
380 }
381
383 template <typename archive_t>
384 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
385 {
386 ar(CEREAL_NVP(m_size));
387 ar(CEREAL_NVP(m_sigma));
391 ar(CEREAL_NVP(m_class));
392 ar(CEREAL_NVP(m_offset));
393 }
394
395 iterator begin() { return { this, 0 }; };
396 const_iterator end() { return { this, size() }; };
397 iterator begin() const { return { this, 0 }; };
398 const_iterator end() const { return { this, size() }; };
399
401 bool operator==(wt_ap const & other) const noexcept
402 {
403 return (m_size == other.m_size) && (m_sigma == other.m_sigma) &&
404 (m_singleton_class_cnt == other.m_singleton_class_cnt) && (m_class_cnt == other.m_class_cnt) &&
405 (m_char2class == other.m_char2class) && (m_class == other.m_class) && (m_offset == other.m_offset);
406 }
407
409 bool operator!=(wt_ap const & other) const noexcept { return !(*this == other); }
410};
411
412} // end namespace sdsl
413#endif
bit_vectors.hpp contains classes for uncompressed and compressed bit vector representations.
#define CEREAL_NVP(X)
Definition: cereal.hpp:31
void close(bool remove_file=false)
Close the int_vector_buffer.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
A generic vector class for integers of width .
Definition: int_vector.hpp:216
static mm_event_proxy event(const std::string &name)
Generic iterator for a random access container.
Definition: iterators.hpp:24
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
static void add_size(structure_tree_node *v, uint64_t value)
A wavelet tree class for integer sequences.
Definition: wt_ap.hpp:38
std::vector< wt_int_type > m_offset
Definition: wt_ap.hpp:66
wt_byte_type m_class
Definition: wt_ap.hpp:65
t_wt_int wt_int_type
Definition: wt_ap.hpp:51
iterator begin()
Definition: wt_ap.hpp:395
const_iterator end() const
Definition: wt_ap.hpp:398
bool operator!=(wt_ap const &other) const noexcept
Inequality operator.
Definition: wt_ap.hpp:409
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal.
Definition: wt_ap.hpp:371
bool operator==(wt_ap const &other) const noexcept
Equality operator.
Definition: wt_ap.hpp:401
wt_ap(const wt_ap &wt)
Copy constructor.
Definition: wt_ap.hpp:205
wt_tag index_category
Definition: wt_ap.hpp:52
size_type m_size
Definition: wt_ap.hpp:60
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Load via cereal.
Definition: wt_ap.hpp:384
const size_type & sigma
Definition: wt_ap.hpp:85
wt_ap & operator=(wt_ap &&wt)
Assignment move operator.
Definition: wt_ap.hpp:230
wt_ap(wt_ap &&wt)
Move copy constructor.
Definition: wt_ap.hpp:216
const_iterator end()
Definition: wt_ap.hpp:396
size_type select(size_type i, value_type c) const
Calculates the i-th occurrence of the symbol c in the supported vector.
Definition: wt_ap.hpp:324
@ lex_ordered
Definition: wt_ap.hpp:56
wt_ap & operator=(const wt_ap &wt)
Assignment operator.
Definition: wt_ap.hpp:219
value_type m_singleton_class_cnt
Definition: wt_ap.hpp:62
wt_byte_type m_char2class
Definition: wt_ap.hpp:64
wt_ap(t_it begin, t_it end, std::string tmp_dir=ram_file_name(""))
Construct the wavelet tree from a sequence defined by two interators.
Definition: wt_ap.hpp:97
bool empty() const
Returns whether the wavelet tree contains no data.
Definition: wt_ap.hpp:249
size_type rank(size_type i, value_type c) const
Calculates how many symbols c are in the prefix [0..i-1] of the supported vector.
Definition: wt_ap.hpp:283
void load(std::istream &in)
Loads the data structure from the given istream.
Definition: wt_ap.hpp:356
int_vector ::difference_type difference_type
Definition: wt_ap.hpp:47
int_vector ::size_type size_type
Definition: wt_ap.hpp:45
iterator begin() const
Definition: wt_ap.hpp:397
value_type operator[](size_type i) const
Recovers the i-th symbol of the original vector.
Definition: wt_ap.hpp:261
int_alphabet_tag alphabet_category
Definition: wt_ap.hpp:53
wt_ap()
Default constructor.
Definition: wt_ap.hpp:88
t_wt_byte wt_byte_type
Definition: wt_ap.hpp:50
const_iterator iterator
Definition: wt_ap.hpp:49
value_type m_class_cnt
Definition: wt_ap.hpp:63
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition: wt_ap.hpp:337
int_vector ::value_type value_type
Definition: wt_ap.hpp:46
random_access_const_iterator< wt_ap > const_iterator
Definition: wt_ap.hpp:48
value_type m_sigma
Definition: wt_ap.hpp:61
size_type size() const
Returns the size of the original vector.
Definition: wt_ap.hpp:246
std::pair< size_type, value_type > inverse_select(size_type i) const
Calculates how many occurrences of symbol wt[i] are in the prefix [0..i-1] of the original sequence.
Definition: wt_ap.hpp:301
int_vector.hpp contains the sdsl::int_vector class.
uint64_t id()
uint64_t pid()
std::string to_string(const T &t, int w=1)
Namespace for the succinct data structure library.
uint64_t count(const t_k2_treap &treap, k2_treap_ns::point_type p1, k2_treap_ns::point_type p2)
Count how many points are in the rectangle (p1,p2)
std::string ram_file_name(const std::string &file)
Returns the corresponding RAM-file name for file.
Definition: ram_fs.hpp:174
size_t write_member(const T &t, std::ostream &out, sdsl::structure_tree_node *v=nullptr, std::string name="")
Definition: io.hpp:84
void read_member(T &t, std::istream &in)
Definition: io.hpp:111
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
void construct_im(t_index &idx, t_data &&data, uint8_t num_bytes=0)
Definition: construct.hpp:58
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:651
wm_int.hpp contains a specialized class for a wavelet tree for sequences over large alphabets.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.