SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
fm_index.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/algorithm>
16 #include <seqan3/std/filesystem>
17 #include <seqan3/std/ranges>
18 
19 #include <sdsl/suffix_trees.hpp>
20 
28 
29 namespace seqan3::detail
30 {
35 {
50  template <semialphabet alphabet_t, text_layout text_layout_mode_, std::ranges::range text_t>
51  static void validate(text_t && text)
52  {
53  if constexpr (text_layout_mode_ == text_layout::single)
54  {
55  static_assert(std::ranges::bidirectional_range<text_t>, "The text must model bidirectional_range.");
56  static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
57  "The alphabet of the text collection must be convertible to the alphabet of the index.");
58  static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
59 
60  if (std::ranges::empty(text))
61  throw std::invalid_argument("The text to index cannot be empty.");
62  }
63  else
64  {
65  static_assert(std::ranges::bidirectional_range<text_t>,
66  "The text collection must model bidirectional_range.");
67  static_assert(std::ranges::bidirectional_range<std::ranges::range_reference_t<text_t>>,
68  "The elements of the text collection must model bidirectional_range.");
69  static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
70  "The alphabet of the text collection must be convertible to the alphabet of the index.");
71  static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
72 
73  if (std::ranges::empty(text))
74  throw std::invalid_argument("The text collection to index cannot be empty.");
75  }
76  static_assert(alphabet_size<range_innermost_value_t<text_t>> <= 256, "The alphabet is too big.");
77  }
78 };
79 } // namespace seqan3::detail
80 
81 namespace seqan3
82 {
83 
85 // forward declarations
86 template <typename index_t>
87 class fm_index_cursor;
88 
89 template <typename index_t>
90 class bi_fm_index_cursor;
91 
92 namespace detail
93 {
94 template <semialphabet alphabet_t,
95  text_layout text_layout_mode_,
96  detail::sdsl_index sdsl_index_type_>
97 class reverse_fm_index;
98 }
100 
137  sdsl::csa_wt<sdsl::wt_blcd<sdsl::bit_vector, // Wavelet tree type
138  sdsl::rank_support_v<>,
139  sdsl::select_support_scan<>,
140  sdsl::select_support_scan<0>>,
141  16, // Sampling rate of the suffix array
142  10'000'000, // Sampling rate of the inverse suffix array
143  sdsl::sa_order_sa_sampling<>, // How to sample positions in the suffix array (text VS SA sampling)
144  sdsl::isa_sampling<>, // How to sample positons in the inverse suffix array
145  sdsl::plain_byte_alphabet>; // How to represent the alphabet
146 
152 
190 template <semialphabet alphabet_t,
191  text_layout text_layout_mode_,
192  detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
193 class fm_index
194 {
195 private:
200  using sdsl_index_type = sdsl_index_type_;
204  using sdsl_char_type = typename sdsl_index_type::alphabet_type::char_type;
206  using sdsl_sigma_type = typename sdsl_index_type::alphabet_type::sigma_type;
208 
209  friend class detail::reverse_fm_index<alphabet_t, text_layout_mode_, sdsl_index_type_>;
210 
213 
215  sdsl::sd_vector<> text_begin;
217  sdsl::select_support_sd<1> text_begin_ss;
219  sdsl::rank_support_sd<1> text_begin_rs;
220 
240  template <std::ranges::range text_t>
242  requires (text_layout_mode_ == text_layout::single)
244  void construct(text_t && text)
245  {
246  detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
247 
248  constexpr auto sigma = alphabet_size<alphabet_t>;
249 
250  // TODO:
251  // * check what happens in sdsl when constructed twice!
252  // * choose between in-memory/external and construction algorithms
253  // * sdsl construction currently only works for int_vector, std::string and char *, not ranges in general
254  // uint8_t largest_char = 0;
255  sdsl::int_vector<8> tmp_text(std::ranges::distance(text));
256 
257  std::ranges::move(text
259  | std::views::transform([] (uint8_t const r)
260  {
261  if constexpr (sigma == 256)
262  {
263  if (r == 255)
264  throw std::out_of_range("The input text cannot be indexed, because for full"
265  "character alphabets the last one/two values are reserved"
266  "(single sequence/collection).");
267  }
268  return r + 1;
269  })
270  | std::views::reverse,
271  std::ranges::begin(tmp_text)); // reverse and increase rank by one
272 
273  sdsl::construct_im(index, tmp_text, 0);
274 
275  // TODO: would be nice but doesn't work since it's private and the public member references are const
276  // index.m_C.resize(largest_char);
277  // index.m_C.shrink_to_fit();
278  // index.m_sigma = largest_char;
279  }
280 
282  template <std::ranges::range text_t>
284  requires (text_layout_mode_ == text_layout::collection)
286  void construct(text_t && text, bool reverse = false)
287  {
288  detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
289 
290  std::vector<size_t> text_sizes;
291 
292  for (auto && t : text)
293  text_sizes.push_back(std::ranges::distance(t));
294 
295  size_t const number_of_texts{text_sizes.size()};
296 
297  // text size including delimiters
298  size_t const text_size = std::accumulate(text_sizes.begin(), text_sizes.end(), 0) + number_of_texts;
299 
300  if (number_of_texts == text_size)
301  throw std::invalid_argument("A text collection that only contains empty texts cannot be indexed.");
302 
303  constexpr auto sigma = alphabet_size<alphabet_t>;
304 
305  // Instead of creating a bitvector of size `text_size`, setting the bits to 1 and then compressing it, we can
306  // use the `sd_vector_builder(text_size, number_of_ones)` because we know the parameters and the 1s we want to
307  // set are in a strictly increasing order. This inplace construction of the compressed vector saves memory.
308  sdsl::sd_vector_builder builder(text_size, number_of_texts);
309  size_t prefix_sum{0};
310 
311  for (auto && size : text_sizes)
312  {
313  builder.set(prefix_sum);
314  prefix_sum += size + 1;
315  }
316 
317  text_begin = sdsl::sd_vector<>(builder);
318  text_begin_ss = sdsl::select_support_sd<1>(&text_begin);
319  text_begin_rs = sdsl::rank_support_sd<1>(&text_begin);
320 
321  // last text in collection needs no delimiter if we have more than one text in the collection
322  sdsl::int_vector<8> tmp_text(text_size - (number_of_texts > 1));
323 
324  constexpr uint8_t delimiter = sigma >= 255 ? 255 : sigma + 1;
325 
326  std::ranges::move(text
327  | views::deep{views::to_rank}
328  | views::deep
329  {
330  std::views::transform([] (uint8_t const r)
331  {
332  if constexpr (sigma >= 255)
333  {
334  if (r >= 254)
335  throw std::out_of_range("The input text cannot be indexed, because"
336  " for full character alphabets the last one/"
337  "two values are reserved (single sequence/"
338  "collection).");
339  }
340  return r + 1;
341  })
342  }
343  | views::join(delimiter),
344  std::ranges::begin(tmp_text));
345 
346  if (!reverse)
347  {
348  // we need at least one delimiter
349  if (number_of_texts == 1)
350  tmp_text.back() = delimiter;
351 
352  std::ranges::reverse(tmp_text);
353  }
354  else
355  {
356  // If only one text is in the text collection, we still need one delimiter at the end to be able to
357  // conduct rank and select queries when locating hits in the index.
358  // Also, tmp_text looks like [text|0], but after reversing we need [txet|0] to be able to add the delimiter.
359  if (number_of_texts == 1)
360  {
361  std::ranges::reverse(tmp_text.begin(), tmp_text.end() - 1);
362  tmp_text.back() = delimiter;
363  }
364  else
365  {
366  std::ranges::reverse(tmp_text);
367  }
368  }
369 
370  sdsl::construct_im(index, tmp_text, 0);
371  }
372 
373 public:
375  static constexpr text_layout text_layout_mode = text_layout_mode_;
376 
381  using alphabet_type = alphabet_t;
383  using size_type = typename sdsl_index_type::size_type;
387 
388  template <typename bi_fm_index_t>
389  friend class bi_fm_index_cursor;
390 
391  template <typename fm_index_t>
392  friend class fm_index_cursor;
393 
394  template <typename fm_index_t>
395  friend class detail::fm_index_cursor_node;
396 
400  fm_index() = default;
401 
403  fm_index(fm_index const & rhs) :
405  {
406  text_begin_ss.set_vector(&text_begin);
407  text_begin_rs.set_vector(&text_begin);
408  }
409 
411  fm_index(fm_index && rhs) :
414  {
415  text_begin_ss.set_vector(&text_begin);
416  text_begin_rs.set_vector(&text_begin);
417  }
418 
421  {
422  index = std::move(rhs.index);
426 
427  text_begin_ss.set_vector(&text_begin);
428  text_begin_rs.set_vector(&text_begin);
429 
430  return *this;
431  }
432 
433  ~fm_index() = default;
434 
443  template <std::ranges::bidirectional_range text_t>
444  explicit fm_index(text_t && text)
445  {
446  construct(std::forward<text_t>(text));
447  }
449 
461  size_type size() const noexcept
462  {
463  return index.size();
464  }
465 
477  bool empty() const noexcept
478  {
479  return size() == 0;
480  }
481 
493  bool operator==(fm_index const & rhs) const noexcept
494  {
495  // (void) rhs;
496  return (index == rhs.index) && (text_begin == rhs.text_begin);
497  }
498 
510  bool operator!=(fm_index const & rhs) const noexcept
511  {
512  return !(*this == rhs);
513  }
514 
529  cursor_type cursor() const noexcept
530  {
531  return {*this};
532  }
533 
541  template <cereal_archive archive_t>
542  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
543  {
544  archive(index);
545  archive(text_begin);
546  archive(text_begin_ss);
547  text_begin_ss.set_vector(&text_begin);
548  archive(text_begin_rs);
549  text_begin_rs.set_vector(&text_begin);
550 
551  auto sigma = alphabet_size<alphabet_t>;
552  archive(sigma);
553  if (sigma != alphabet_size<alphabet_t>)
554  {
555  throw std::logic_error{"The fm_index was built over an alphabet of size " + std::to_string(sigma) +
556  " but it is being read into an fm_index with an alphabet of size " +
557  std::to_string(alphabet_size<alphabet_t>) + "."};
558  }
559 
560  bool tmp = text_layout_mode;
561  archive(tmp);
562  if (tmp != text_layout_mode)
563  {
564  throw std::logic_error{std::string{"The fm_index was built over a "} +
565  (tmp ? "text collection" : "single text") +
566  " but it is being read into an fm_index expecting a " +
567  (text_layout_mode ? "text collection." : "single text.")};
568  }
569  }
571 
572 };
573 
578 template <std::ranges::range text_t>
579 fm_index(text_t &&) -> fm_index<range_innermost_value_t<text_t>, text_layout{range_dimension_v<text_t> != 1}>;
581 
583 } // namespace seqan3
584 
585 namespace seqan3::detail
586 {
587 
600 template <semialphabet alphabet_t,
601  text_layout text_layout_mode,
602  detail::sdsl_index sdsl_index_type = default_sdsl_index_type>
603 class reverse_fm_index : public fm_index<alphabet_t, text_layout_mode, sdsl_index_type>
604 {
605 private:
607  template <std::ranges::range text_t>
608  void construct_(text_t && text)
609  {
610  if constexpr (text_layout_mode == text_layout::single)
611  {
612  auto reverse_text = text | std::views::reverse;
613  this->construct(reverse_text);
614  }
615  else
616  {
617  auto reverse_text = text | views::deep{std::views::reverse} | std::views::reverse;
618  this->construct(reverse_text, true);
619  }
620  }
621 
622 public:
624 
626  template <std::ranges::bidirectional_range text_t>
627  explicit reverse_fm_index(text_t && text)
628  {
629  construct_(std::forward<text_t>(text));
630  }
631 
632 };
633 
634 } // namespace seqan3::detail
T accumulate(T... args)
Adaptations of algorithms from the Ranges TS.
T begin(T... args)
The SeqAn Bidirectional FM Index Cursor.
Definition: bi_fm_index_cursor.hpp:58
An FM Index specialisation that handles reversing the given text.
Definition: fm_index.hpp:604
void construct_(text_t &&text)
Constructs the index given a range. The range cannot be an rvalue (i.e. a temporary object) and has t...
Definition: fm_index.hpp:608
reverse_fm_index(text_t &&text)
Constructor that immediately constructs the index given a range. The range cannot be empty.
Definition: fm_index.hpp:627
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:90
The SeqAn FM Index.
Definition: fm_index.hpp:194
sdsl_index_type_ sdsl_index_type
The type of the underlying SDSL index.
Definition: fm_index.hpp:200
fm_index()=default
Defaulted.
static constexpr text_layout text_layout_mode
Indicates whether index is built over a collection.
Definition: fm_index.hpp:375
cursor_type cursor() const noexcept
Returns a seqan3::fm_index_cursor on the index that can be used for searching. Cursor is pointing to ...
Definition: fm_index.hpp:529
size_type size() const noexcept
Returns the length of the indexed text including sentinel characters.
Definition: fm_index.hpp:461
sdsl::rank_support_sd< 1 > text_begin_rs
Rank support for text_begin.
Definition: fm_index.hpp:219
sdsl_index_type index
Underlying index from the SDSL.
Definition: fm_index.hpp:212
bool operator!=(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:510
typename sdsl_index_type::alphabet_type::sigma_type sdsl_sigma_type
The type of the alphabet size of the underlying SDSL index.
Definition: fm_index.hpp:206
bool empty() const noexcept
Checks whether the index is empty.
Definition: fm_index.hpp:477
bool operator==(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:493
typename sdsl_index_type::alphabet_type::char_type sdsl_char_type
The type of the reduced alphabet type. (The reduced alphabet might be smaller than the original alpha...
Definition: fm_index.hpp:204
fm_index & operator=(fm_index rhs)
When copy/move assigning, also update internal data structures.
Definition: fm_index.hpp:420
~fm_index()=default
Defaulted.
alphabet_t alphabet_type
The type of the underlying character of the indexed text.
Definition: fm_index.hpp:381
fm_index(fm_index &&rhs)
When move constructing, also update internal data structures.
Definition: fm_index.hpp:411
void construct(text_t &&text)
Constructs the index given a range. The range cannot be an rvalue (i.e. a temporary object) and has t...
Definition: fm_index.hpp:244
typename sdsl_index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index.hpp:383
sdsl::sd_vector text_begin
Bitvector storing begin positions for collections.
Definition: fm_index.hpp:215
sdsl::select_support_sd< 1 > text_begin_ss
Select support for text_begin.
Definition: fm_index.hpp:217
fm_index(text_t &&text)
Constructor that immediately constructs the index given a range. The range cannot be empty.
Definition: fm_index.hpp:444
fm_index(fm_index const &rhs)
When copy constructing, also update internal data structures.
Definition: fm_index.hpp:403
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T end(T... args)
This header includes C++17 filesystem support and imports it into namespace std::filesystem (independ...
Provides the seqan3::fm_index_cursor for searching in the unidirectional seqan3::fm_index.
constexpr auto alphabet_size
A type trait that holds the size of a (semi-)alphabet.
Definition: concept.hpp:730
typename range_innermost_value< t >::type range_innermost_value_t
Shortcut for seqan3::range_innermost_value (transformation_trait shortcut).
Definition: type_traits.hpp:240
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: concept.hpp:82
sdsl_wt_index_type default_sdsl_index_type
The default FM Index Configuration.
Definition: fm_index.hpp:151
sdsl::csa_wt< sdsl::wt_blcd< sdsl::bit_vector, sdsl::rank_support_v<>, sdsl::select_support_scan<>, sdsl::select_support_scan< 0 > >, 16, 10 '000 '000, sdsl::sa_order_sa_sampling<>, sdsl::isa_sampling<>, sdsl::plain_byte_alphabet > sdsl_wt_index_type
The FM Index Configuration using a Wavelet Tree.
Definition: fm_index.hpp:145
fm_index(text_t &&) -> fm_index< range_innermost_value_t< text_t >, text_layout
Deduces the alphabet and dimensions of the text.
Definition: fm_index.hpp:579
@ single
The text is a single range.
Definition: concept.hpp:84
@ collection
The text is a range of ranges.
Definition: concept.hpp:86
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:434
auto const to_rank
A view that calls seqan3::to_rank() on each element in the input range.
Definition: to_rank.hpp:67
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:70
The basis for seqan3::alphabet, but requires only rank interface (not char).
Provides seqan3::views::join.
The internal SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
#define CEREAL_SERIALIZE_FUNCTION_NAME
Macro for Cereal's serialize function.
Definition: platform.hpp:134
T push_back(T... args)
Adaptations of concepts from the Ranges TS.
Provides the concept for seqan3::detail::sdsl_index.
T size(T... args)
Internal representation of the node of an FM index cursor.
Definition: fm_index_cursor.hpp:34
Class used to validate the requirements on the input text of the fm_index.
Definition: fm_index.hpp:35
static void validate(text_t &&text)
Validates the fm_index template parameters and text.
Definition: fm_index.hpp:51
Provides seqan3::views::to.
Provides seqan3::views::to_rank.
T to_string(T... args)