SDSL 3.0.1
Succinct Data Structure Library
lcp_support_tree2.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
4#ifndef INCLUDED_SDSL_SUPPORT_TREE2
5#define INCLUDED_SDSL_SUPPORT_TREE2
6
7#include <iostream>
8#include <string>
9
10#include <sdsl/lcp.hpp>
13#include <sdsl/util.hpp>
14#include <sdsl/wt_huff.hpp>
15
16namespace sdsl
17{
18
19// Forward declaration of helper method
20template <uint32_t t_dens, uint8_t t_bwt_width>
21void construct_first_child_and_lf_lcp(int_vector_buffer<> &,
22 int_vector_buffer<t_bwt_width> &,
23 const std::string &,
24 const std::string &,
25 int_vector<> &);
26
36template <uint32_t t_dens, class t_cst>
38{
39 public:
46 typedef const pointer const_pointer;
49 typedef t_cst cst_type;
51
53
54 enum
55 {
58 sa_order = 0
59 };
60
61 template <class CST>
62 struct type
63 {
65 };
66
67 private:
68 const cst_type * m_cst;
69 small_lcp_type m_small_lcp; // vector for lcp values < 254
70 int_vector<> m_big_lcp; // vector for lcp values >= 254
71
72 public:
75
81
83
86 _lcp_support_tree2(cache_config & config, const cst_type * cst = nullptr)
87 {
88 m_cst = cst;
89
91 std::string bwt_file = cache_file_name(key_bwt<t_cst::csa_type::alphabet_type::int_width>(), config);
93
94 std::string sml_lcp_file = tmp_file(config, "_fc_lf_lcp_sml");
95 std::string big_lcp_file = tmp_file(config, "_fc_lf_lcp_big");
96
97 construct_first_child_and_lf_lcp<t_dens>(lcp_buf, bwt_buf, sml_lcp_file, big_lcp_file, m_big_lcp);
98 int_vector_buffer<8> sml_lcp_buf(sml_lcp_file);
99
100 m_small_lcp = small_lcp_type(sml_lcp_buf.begin(), sml_lcp_buf.end(), config.dir);
101 sml_lcp_buf.close(true);
102 sdsl::remove(big_lcp_file);
103 }
104
105 void set_cst(const cst_type * cst) { m_cst = cst; }
106
107 size_type size() const { return m_cst->size(); }
108
110
111 size_type empty() const { return m_small_lcp.empty(); }
112
114 const_iterator begin() const { return const_iterator(this, 0); }
115
117 const_iterator end() const { return const_iterator(this, size()); }
118
120
125 {
126 size_type idx, offset = 0;
127 uint8_t val;
128 start:
129 idx = m_cst->tlcp_idx(i);
130 val = m_small_lcp[idx];
131 if (val < 254)
132 {
133 return val; // - offset;
134 }
135 else if (val == 254)
136 { // if lcp value is >= 254 and position i is reducible
137 i = m_cst->csa.lf[i]; // i = LF[i] // (*m_psi)(i);
138 ++offset; // goto lcp value, which is one bigger
139 goto start;
140 }
141 else
142 { // if lcp value is >= 254 and (not reducable or sampled)
143 return m_big_lcp[m_small_lcp.rank(idx, 255)] - offset;
144 }
145 }
146
148 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
149 {
150 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
151 size_type written_bytes = 0;
152 written_bytes += m_small_lcp.serialize(out, child, "small_lcp");
153 written_bytes += m_big_lcp.serialize(out, child, "large_lcp");
154 structure_tree::add_size(child, written_bytes);
155 return written_bytes;
156 }
157
159 void load(std::istream & in, const t_cst * cst = nullptr)
160 {
161 m_small_lcp.load(in);
162 m_big_lcp.load(in);
163 m_cst = cst;
164 }
165
166 template <typename archive_t>
167 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
168 {
169 ar(CEREAL_NVP(m_small_lcp));
170 ar(CEREAL_NVP(m_big_lcp));
171 }
172
173 template <typename archive_t>
174 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
175 {
176 ar(CEREAL_NVP(m_small_lcp));
177 ar(CEREAL_NVP(m_big_lcp));
178 }
179
181 bool operator==(_lcp_support_tree2 const & other) const noexcept
182 {
183 return (m_small_lcp == other.m_small_lcp) && (m_big_lcp == other.m_big_lcp);
184 }
185
187 bool operator!=(_lcp_support_tree2 const & other) const noexcept { return !(*this == other); }
188};
189
191template <uint32_t t_dens = 16>
193{
194 template <class t_cst>
196};
197
203template <uint32_t t_dens, uint8_t t_bwt_width>
206 const std::string & small_lcp_file,
207 const std::string & big_lcp_file,
208 int_vector<> & big_lcp)
209{
211 const size_type M = 255; // limit for values represented in the small LCP part
212 size_type buf_len = 1000000;
213 lcp_buf.buffersize(buf_len);
214 bwt_buf.buffersize(buf_len);
215 size_type n = lcp_buf.size();
216
217 int_vector_buffer<8> sml_lcp_out(small_lcp_file, std::ios::out);
218
219 osfstream big_lcp_out(big_lcp_file, std::ios::out | std::ios::trunc | std::ios::binary);
220
221 size_type fc_cnt = 0; // number of lcp values at the first child r
222 size_type fc_cnt_big = 0; // number of lcp values at the first child which are big and not reducible
223 size_type fc_cnt_big2 = 0;
224 sorted_multi_stack_support vec_stack(n); // occupies 2n bits
225 bit_vector is_big_and_not_reducable(n, 0); // initialized with 0s
226 bool is_one_big_and_not_reducable = false; // all positions have to be reducible
227
228 size_type y, max_lcp = 0;
229 uint64_t last_bwti = 0, val;
230 for (size_type i = 0, x; i < n; ++i)
231 {
232 x = lcp_buf[i];
233 is_one_big_and_not_reducable = false;
234
235 while (!vec_stack.empty() and x < vec_stack.top())
236 {
237 y = vec_stack.top();
238 is_one_big_and_not_reducable |= is_big_and_not_reducable[vec_stack.size() - 1];
239 if (vec_stack.pop())
240 { // if y was the last copy of y on the stack
241 if (y > M - 2)
242 {
243 if (is_one_big_and_not_reducable)
244 {
245 val = M;
246 big_lcp_out.write((char *)&y, sizeof(y));
247 ++fc_cnt_big;
248 if (y > max_lcp) max_lcp = y;
249 }
250 else
251 {
252 val = M - 1;
253 ++fc_cnt_big2;
254 }
255 }
256 else
257 {
258 val = y;
259 }
260 sml_lcp_out.push_back(val);
261 ++fc_cnt;
262 is_one_big_and_not_reducable = false;
263 }
264 }
265 if (x > M - 2 and (0 == i or last_bwti != bwt_buf[i] or x % t_dens == 0))
266 {
267 is_big_and_not_reducable[vec_stack.size()] = 1;
268 }
269 else
270 {
271 is_big_and_not_reducable[vec_stack.size()] = 0;
272 }
273 vec_stack.push(x);
274 last_bwti = bwt_buf[i];
275 }
276
277 while (!vec_stack.empty())
278 {
279 y = vec_stack.top();
280 if (vec_stack.pop())
281 {
282 if (y > M - 2)
283 {
284 if (is_big_and_not_reducable[vec_stack.size()])
285 {
286 val = M;
287 big_lcp_out.write((char *)&y, sizeof(y));
288 ++fc_cnt_big;
289 if (y > max_lcp) max_lcp = y;
290 }
291 else
292 {
293 val = M - 1;
294 ++fc_cnt_big2;
295 }
296 }
297 else
298 {
299 val = y;
300 }
301 sml_lcp_out.push_back(val);
302 ++fc_cnt;
303 }
304 }
305
306 big_lcp_out.close();
307 isfstream big_lcp_in(big_lcp_file, std::ios::in | std::ios::binary);
308 big_lcp.width(bits::hi(max_lcp) + 1);
309 big_lcp.resize(fc_cnt_big);
310
311 for (size_type i = 0; i < fc_cnt_big; ++i)
312 {
313 big_lcp_in.read((char *)&y, sizeof(y));
314 big_lcp[i] = y;
315 }
316}
317
318} // namespace sdsl
319#endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:31
An lcp array class for cst_sct3 and cst_sada.
const_iterator end() const
Returns a const_iterator to the element after the last element.
_lcp_support_tree2()
Default constructor.
random_access_const_iterator< _lcp_support_tree2 > const_iterator
_lcp_support_tree2(cache_config &config, const cst_type *cst=nullptr)
Constructor.
_lcp_support_tree2 & operator=(const _lcp_support_tree2 &)=default
int_vector ::size_type size_type
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
_lcp_support_tree2 & operator=(_lcp_support_tree2 &&)=default
lcp_tree_and_lf_compressed_tag lcp_category
bool operator!=(_lcp_support_tree2 const &other) const noexcept
Inequality operator.
wt_huff< bit_vector, rank_support_v5<>, select_support_scan< 1 >, select_support_scan< 0 > > small_lcp_type
void set_cst(const cst_type *cst)
_lcp_support_tree2(_lcp_support_tree2 &&)=default
int_vector ::value_type value_type
value_type operator[](size_type i) const
[]-operator
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
const value_type const_reference
bool operator==(_lcp_support_tree2 const &other) const noexcept
Equality operator.
const_iterator begin() const
Returns a const_iterator to the first element.
_lcp_support_tree2(const _lcp_support_tree2 &)=default
Copy / Move constructor.
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serialize to a stream.
int_vector ::difference_type difference_type
void load(std::istream &in, const t_cst *cst=nullptr)
Load from a stream.
uint64_t size() const
Returns the number of elements currently stored.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
void load(std::istream &in)
Load the int_vector for a stream.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:595
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the int_vector to a stream.
static size_type max_size() noexcept
Maximum size of the int_vector.
Definition: int_vector.hpp:542
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:521
void close()
Close the stream.
Definition: sfstream.hpp:87
Generic iterator for a random access container.
Definition: iterators.hpp:24
Stack which contains elements from [0..n] in sorted order. Duplicates are possible.
size_type size() const
Returns the number of element is the stack.
bool empty() const
Returns if the stack is empty.
size_type top() const
Returns the topmost index on the stack.
bool pop()
Pop the topmost index of the stack.
bool push(size_type x)
Push the index x of vector vec onto the stack.
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
static void add_size(structure_tree_node *v, uint64_t value)
A prefix code-shaped wavelet.
Definition: wt_pc.hpp:44
size_type rank(size_type i, value_type c) const
Calculates how many symbols c are in the prefix [0..i-1].
Definition: wt_pc.hpp:335
void load(std::istream &in)
Loads the data structure from the given istream.
Definition: wt_pc.hpp:676
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition: wt_pc.hpp:660
bool empty() const
Returns whether the wavelet tree contains no data.
Definition: wt_pc.hpp:287
lcp.hpp contains classes for lcp information.
constexpr char KEY_LCP[]
Definition: config.hpp:44
int_vector ::size_type size_type
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
std::string tmp_file(const cache_config &config, std::string name_part="")
Returns a name for a temporary file. I.e. the name was not used before.
Definition: io.hpp:698
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
void construct_first_child_and_lf_lcp(int_vector_buffer<> &, int_vector_buffer< t_bwt_width > &, const std::string &, const std::string &, int_vector<> &)
rank_support_v.hpp contains rank_support_v.
sorted_multi_stack_support.hpp contains a data structure for a stack which contains elements from [0....
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:651
Helper class for construction process.
Definition: config.hpp:67
std::string dir
Definition: config.hpp:71
Helper class which provides _lcp_support_tree2 the context of a CST.
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.