SDSL 3.0.1
Succinct Data Structure Library
construct.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
9#ifndef INCLUDED_SDSL_CONSTRUCT
10#define INCLUDED_SDSL_CONSTRUCT
11
12#include <string>
13
16#include <sdsl/construct_sa.hpp>
17#include <sdsl/int_vector.hpp>
19
20namespace sdsl
21{
22
23template <class int_vector>
24bool contains_no_zero_symbol(const int_vector & text, const std::string & file)
25{
26 for (int_vector_size_type i = 0; i < text.size(); ++i)
27 {
28 if ((uint64_t)0 == text[i])
29 {
30 throw std::logic_error(std::string("Error: File \"") + file + "\" contains zero symbol.");
31 return false;
32 }
33 }
34 return true;
35}
36
37template <class int_vector>
39{
40 text.resize(text.size() + 1);
41 text[text.size() - 1] = 0;
42}
43
44template <class t_index>
45void construct(t_index & idx, std::string file, uint8_t num_bytes = 0, bool move_input = false)
46{
47 tMSS file_map;
48 cache_config config;
49 if (is_ram_file(file))
50 {
51 config.dir = "@";
52 config.delete_data = move_input;
53 }
54 construct(idx, file, config, num_bytes);
55}
56
57template <class t_index, class t_data>
58void construct_im(t_index & idx, t_data && data, uint8_t num_bytes = 0)
59{
62 construct(idx, tmp_file, num_bytes, std::is_rvalue_reference<t_data &&>::value);
64}
65
67
76template <class t_index>
77void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes = 0)
78{
79 // delegate to CSA or CST construction
80 typename t_index::index_category index_tag;
81 construct(idx, file, config, num_bytes, index_tag);
82}
83
84// Specialization for WTs
85template <class t_index>
86void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes, wt_tag)
87{
88 auto event = memory_monitor::event("construct wavelet tree");
89 if ((t_index::alphabet_category::WIDTH == 8 and num_bytes <= 1) or
90 (t_index::alphabet_category::WIDTH == 0 and num_bytes != 'd'))
91 {
93 std::ios::in,
94 1024 * 1024,
95 num_bytes * 8,
96 (bool)num_bytes);
97 idx = t_index(text_buf.begin(), text_buf.end(), config.dir);
98 }
99 else
100 {
102 load_vector_from_file(text, file, num_bytes);
103 std::string tmp_key = util::to_string(util::pid()) + "_" + util::to_string(util::id());
104 std::string tmp_file_name = cache_file_name(tmp_key, config);
105 store_to_file(text, tmp_file_name);
106 util::clear(text);
107 {
109 idx = t_index(text_buf.begin(), text_buf.end(), config.dir);
110 }
111 sdsl::remove(tmp_file_name);
112 }
113}
114
115// Specialization for CSAs
116template <class t_index>
117void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes, csa_tag)
118{
119 auto event = memory_monitor::event("construct CSA");
120 constexpr uint8_t width = t_index::alphabet_category::WIDTH;
124 {
125 auto event = memory_monitor::event("parse input text");
126 // (1) check, if the text is cached
127 if (!cache_file_exists(KEY_TEXT, config))
128 {
129 text_type text;
130 load_vector_from_file(text, file, num_bytes);
131 if (contains_no_zero_symbol(text, file))
132 {
133 if (!is_ram_file(file))
134 {
135 append_zero_symbol(text);
136 store_to_cache(text, KEY_TEXT, config);
137 }
138 else
139 {
140 auto text_mapper = write_out_mapper<width>::create(cache_file_name(KEY_TEXT, config),
141 text.size() + 1,
142 text.width());
143 std::copy(text.begin(), text.end(), text_mapper.begin());
144 text_mapper[text.size()] = 0;
145 }
146 }
147 }
149 }
150 if (config.delete_data) { sdsl::remove(file); }
151 {
152 // (2) check, if the suffix array is cached
153 auto event = memory_monitor::event("SA");
154 if (!cache_file_exists(conf::KEY_SA, config)) { construct_sa<t_index::alphabet_category::WIDTH>(config); }
156 }
157 {
158 // (3) construct BWT
159 auto event = memory_monitor::event("BWT");
160 if (!cache_file_exists(KEY_BWT, config)) { construct_bwt<t_index::alphabet_category::WIDTH>(config); }
162 }
163 {
164 // (4) use BWT to construct the CSA
165 auto event = memory_monitor::event("construct CSA");
166 idx = t_index(config);
167 }
168 if (config.delete_files)
169 {
170 auto event = memory_monitor::event("delete temporary files");
171 util::delete_all_files(config.file_map);
172 }
173}
174
175// Specialization for standalone LCPs
176template <class t_index, uint8_t t_width>
177void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes, lcp_tag)
178{
179 auto event = memory_monitor::event("construct compressed LCP");
181 typedef int_vector<t_width> text_type;
182 {
183 // (2) check, if the longest common prefix array is cached
184 auto event = memory_monitor::event("LCP");
185 if (!cache_file_exists(conf::KEY_LCP, config))
186 {
187 {
188 auto event = memory_monitor::event("parse input text");
189 // (1) check, if the text is cached
190 if (!cache_file_exists(KEY_TEXT, config))
191 {
192 text_type text;
193 load_vector_from_file(text, file, num_bytes);
194 if (contains_no_zero_symbol(text, file))
195 {
196 append_zero_symbol(text);
197 store_to_cache(text, KEY_TEXT, config);
198 }
199 }
201 }
202 {
203 // (2) check, if the suffix array is cached
204 auto event = memory_monitor::event("SA");
205 if (!cache_file_exists(conf::KEY_SA, config)) { construct_sa<t_width>(config); }
207 }
208 if (t_width == 8) { construct_lcp_semi_extern_PHI(config); }
209 else
210 {
211 construct_lcp_PHI<t_width>(config);
212 }
213 }
215 }
216 {
217 auto event = memory_monitor::event("compressed LCP");
218 idx = t_index(config);
219 }
220 if (config.delete_files)
221 {
222 auto event = memory_monitor::event("delete temporary files");
223 util::delete_all_files(config.file_map);
224 }
225}
226
227// Specialization for standalone LCPs
228template <class t_index>
229void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes, lcp_tag tag)
230{
231 if (1 == num_bytes) { construct<t_index, 8>(idx, file, config, num_bytes, tag); }
232 else
233 {
234 construct<t_index, 0>(idx, file, config, num_bytes, tag);
235 }
236}
237
238// Specialization for CSTs
239template <class t_index>
240void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes, cst_tag)
241{
242 auto event = memory_monitor::event("construct CST");
245 csa_tag csa_t;
246 {
247 // (1) check, if the compressed suffix array is cached
248 typename t_index::csa_type csa;
249 if (!cache_file_exists(std::string(conf::KEY_CSA) + "_" + util::class_to_hash(csa), config))
250 {
251 cache_config csa_config(false, config.dir, config.id, config.file_map);
252 construct(csa, file, csa_config, num_bytes, csa_t);
253 auto event = memory_monitor::event("store CSA");
254 config.file_map = csa_config.file_map;
255 store_to_cache(csa, std::string(conf::KEY_CSA) + "_" + util::class_to_hash(csa), config);
256 }
257 register_cache_file(std::string(conf::KEY_CSA) + "_" + util::class_to_hash(csa), config);
258 }
259 {
260 // (2) check, if the longest common prefix array is cached
261 auto event = memory_monitor::event("LCP");
265 if (!cache_file_exists(conf::KEY_LCP, config))
266 {
267 if (t_index::alphabet_category::WIDTH == 8) { construct_lcp_semi_extern_PHI(config); }
268 else
269 {
270 construct_lcp_PHI<t_index::alphabet_category::WIDTH>(config);
271 }
272 }
274 }
275 {
276 auto event = memory_monitor::event("CST");
277 idx = t_index(config);
278 }
279 if (config.delete_files)
280 {
281 auto event = memory_monitor::event("delete temporary files");
282 util::delete_all_files(config.file_map);
283 }
284}
285
286} // end namespace sdsl
287#endif
A generic vector class for integers of width .
Definition: int_vector.hpp:216
size_type size() const noexcept
The number of elements in the int_vector.
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:521
static mm_event_proxy event(const std::string &name)
static int_vector_mapper< t_width > create(const std::string &key, cache_config &config)
construct_bwt.hpp contains a space and time efficient construction method for the Burrows and Wheeler...
construct_lcp.hpp contains a space and time efficient construction method for lcp arrays
construct_sa.hpp contains an interface to access suffix array construction algorithms
int_vector.hpp contains the sdsl::int_vector class.
constexpr char KEY_CSA[]
Definition: config.hpp:38
constexpr char KEY_SA[]
Definition: config.hpp:37
constexpr char KEY_TEXT[]
Definition: config.hpp:41
constexpr char KEY_LCP[]
Definition: config.hpp:44
constexpr char KEY_BWT[]
Definition: config.hpp:35
int remove(const std::string &name)
Remove the file with key name
Definition: ram_fs.hpp:69
uint64_t id()
uint64_t pid()
std::string to_string(const T &t, int w=1)
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
bool is_ram_file(const std::string &file)
Determines if the given file is a RAM-file.
Definition: ram_fs.hpp:158
std::map< std::string, std::string > tMSS
Definition: config.hpp:50
std::string ram_file_name(const std::string &file)
Returns the corresponding RAM-file name for file.
Definition: ram_fs.hpp:174
bool load_vector_from_file(t_int_vec &v, const std::string &file, uint8_t num_bytes=1, uint8_t max_int_width=64)
from disk.
Definition: io.hpp:187
std::string tmp_file(const cache_config &config, std::string name_part="")
Returns a name for a temporary file. I.e. the name was not used before.
Definition: io.hpp:698
bool contains_no_zero_symbol(const int_vector &text, const std::string &file)
Definition: construct.hpp:24
void construct(t_index &idx, std::string file, uint8_t num_bytes=0, bool move_input=false)
Definition: construct.hpp:45
void construct_lcp_semi_extern_PHI(cache_config &config)
Construct the LCP array (only for byte strings)
uint64_t int_vector_size_type
Definition: config.hpp:48
void register_cache_file(const std::string &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition: io.hpp:656
bool store_to_file(const T &v, const std::string &file)
Store a data structure to a file.
Definition: io.hpp:798
bool cache_file_exists(const std::string &key, const cache_config &config)
Checks if the resource specified by the key exists in the cache.
Definition: io.hpp:672
bool store_to_cache(const T &v, const std::string &key, cache_config &config, bool add_type_hash=false)
Stores the object v as a resource in the cache.
Definition: io.hpp:737
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
void append_zero_symbol(int_vector &text)
Definition: construct.hpp:38
void construct_im(t_index &idx, t_data &&data, uint8_t num_bytes=0)
Definition: construct.hpp:58
Contains declarations and definitions of data structure concepts.
Helper class for construction process.
Definition: config.hpp:67
std::string id
Definition: config.hpp:72
std::string dir
Definition: config.hpp:71
Helper classes to transform width=0 and width=8 to corresponding bwt key.
Definition: config.hpp:110
Helper classes to transform width=0 and width=8 to corresponding text key.
Definition: config.hpp:91