You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and dots ('.'), can be up to 35 characters long. Letters must be lowercase.
1113 lines
47 KiB
1113 lines
47 KiB
// |
|
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) |
|
// |
|
// Distributed under the Boost Software License, Version 1.0. (See |
|
// accompanying file LICENSE_1_0.txt or copy at |
|
// http://www.boost.org/LICENSE_1_0.txt) |
|
// |
|
#ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED |
|
#define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED |
|
|
|
#include <boost/locale/config.hpp> |
|
#include <boost/locale/boundary/types.hpp> |
|
#include <boost/locale/boundary/facets.hpp> |
|
#include <boost/locale/boundary/segment.hpp> |
|
#include <boost/locale/boundary/boundary_point.hpp> |
|
#include <boost/iterator/iterator_facade.hpp> |
|
#include <boost/shared_ptr.hpp> |
|
#include <boost/cstdint.hpp> |
|
#include <boost/assert.hpp> |
|
#ifdef BOOST_MSVC |
|
# pragma warning(push) |
|
# pragma warning(disable : 4275 4251 4231 4660) |
|
#endif |
|
#include <string> |
|
#include <locale> |
|
#include <vector> |
|
#include <iterator> |
|
#include <algorithm> |
|
#include <stdexcept> |
|
|
|
#include <iostream> |
|
|
|
namespace boost { |
|
|
|
namespace locale { |
|
|
|
namespace boundary { |
|
/// |
|
/// \defgroup boundary Boundary Analysis |
|
/// |
|
/// This module contains all operations required for %boundary analysis of text: character, word, line and sentence boundaries
|
/// |
|
/// @{ |
|
/// |
|
|
|
/// \cond INTERNAL |
|
|
|
namespace details { |
|
|
|
template<typename IteratorType,typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category> |
|
struct mapping_traits { |
|
typedef typename std::iterator_traits<IteratorType>::value_type char_type; |
|
static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l) |
|
{ |
|
std::basic_string<char_type> str(b,e); |
|
return std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size()); |
|
} |
|
}; |
|
|
|
//
// Tiny type-equality test, kept local so that no additional header is needed.
//
template<typename First,typename Second>
struct linear_iterator_same_type {
    static const bool value = false;
};

template<typename Type>
struct linear_iterator_same_type<Type,Type> {
    static const bool value = true;
};

//
// Tells whether SomeIteratorType is one of the iterator types that are known
// to walk over a contiguous buffer of CharType: raw pointers and the
// iterators of std::basic_string<CharType> and std::vector<CharType>.
//
// This is written as one primary template that compares types rather than as
// one partial specialization per container: partial specializations on
// `typename Container<CharType>::iterator` all match the same argument when a
// standard library uses one iterator type for several containers, which makes
// the instantiation ambiguous.
//
template<typename CharType,typename SomeIteratorType>
struct linear_iterator_traits {
    static const bool is_linear =
           linear_iterator_same_type<SomeIteratorType,CharType *>::value
        || linear_iterator_same_type<SomeIteratorType,CharType const *>::value
        || linear_iterator_same_type<SomeIteratorType,typename std::basic_string<CharType>::iterator>::value
        || linear_iterator_same_type<SomeIteratorType,typename std::basic_string<CharType>::const_iterator>::value
        || linear_iterator_same_type<SomeIteratorType,typename std::vector<CharType>::iterator>::value
        || linear_iterator_same_type<SomeIteratorType,typename std::vector<CharType>::const_iterator>::value;
};
|
|
|
|
|
template<typename IteratorType> |
|
struct mapping_traits<IteratorType,std::random_access_iterator_tag> { |
|
|
|
typedef typename std::iterator_traits<IteratorType>::value_type char_type; |
|
|
|
|
|
|
|
static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l) |
|
{ |
|
index_type result; |
|
|
|
// |
|
// Optimize for most common cases |
|
// |
|
// C++0x requires that string is continious in memory and all known |
|
// string implementations |
|
// do this because of c_str() support. |
|
// |
|
|
|
if(linear_iterator_traits<char_type,IteratorType>::is_linear && b!=e) |
|
{ |
|
char_type const *begin = &*b; |
|
char_type const *end = begin + (e-b); |
|
index_type tmp=std::use_facet<boundary_indexing<char_type> >(l).map(t,begin,end); |
|
result.swap(tmp); |
|
} |
|
else { |
|
std::basic_string<char_type> str(b,e); |
|
index_type tmp = std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size()); |
|
result.swap(tmp); |
|
} |
|
return result; |
|
} |
|
}; |
|
|
|
template<typename BaseIterator> |
|
class mapping { |
|
public: |
|
typedef BaseIterator base_iterator; |
|
typedef typename std::iterator_traits<base_iterator>::value_type char_type; |
|
|
|
|
|
mapping(boundary_type type, |
|
base_iterator begin, |
|
base_iterator end, |
|
std::locale const &loc) |
|
: |
|
index_(new index_type()), |
|
begin_(begin), |
|
end_(end) |
|
{ |
|
index_type idx=details::mapping_traits<base_iterator>::map(type,begin,end,loc); |
|
index_->swap(idx); |
|
} |
|
|
|
mapping() |
|
{ |
|
} |
|
|
|
index_type const &index() const |
|
{ |
|
return *index_; |
|
} |
|
|
|
base_iterator begin() const |
|
{ |
|
return begin_; |
|
} |
|
|
|
base_iterator end() const |
|
{ |
|
return end_; |
|
} |
|
|
|
private: |
|
boost::shared_ptr<index_type> index_; |
|
base_iterator begin_,end_; |
|
}; |
|
|
|
template<typename BaseIterator> |
|
class segment_index_iterator : |
|
public boost::iterator_facade< |
|
segment_index_iterator<BaseIterator>, |
|
segment<BaseIterator>, |
|
boost::bidirectional_traversal_tag, |
|
segment<BaseIterator> const & |
|
> |
|
{ |
|
public: |
|
typedef BaseIterator base_iterator; |
|
typedef mapping<base_iterator> mapping_type; |
|
typedef segment<base_iterator> segment_type; |
|
|
|
segment_index_iterator() : current_(0,0),map_(0) |
|
{ |
|
} |
|
|
|
segment_index_iterator(base_iterator p,mapping_type const *map,rule_type mask,bool full_select) : |
|
map_(map), |
|
mask_(mask), |
|
full_select_(full_select) |
|
{ |
|
set(p); |
|
} |
|
segment_index_iterator(bool is_begin,mapping_type const *map,rule_type mask,bool full_select) : |
|
map_(map), |
|
mask_(mask), |
|
full_select_(full_select) |
|
{ |
|
if(is_begin) |
|
set_begin(); |
|
else |
|
set_end(); |
|
} |
|
|
|
segment_type const &dereference() const |
|
{ |
|
return value_; |
|
} |
|
|
|
bool equal(segment_index_iterator const &other) const |
|
{ |
|
return map_ == other.map_ && current_.second == other.current_.second; |
|
} |
|
|
|
void increment() |
|
{ |
|
std::pair<size_t,size_t> next = current_; |
|
if(full_select_) { |
|
next.first = next.second; |
|
while(next.second < size()) { |
|
next.second++; |
|
if(valid_offset(next.second)) |
|
break; |
|
} |
|
if(next.second == size()) |
|
next.first = next.second - 1; |
|
} |
|
else { |
|
while(next.second < size()) { |
|
next.first = next.second; |
|
next.second++; |
|
if(valid_offset(next.second)) |
|
break; |
|
} |
|
} |
|
update_current(next); |
|
} |
|
|
|
void decrement() |
|
{ |
|
std::pair<size_t,size_t> next = current_; |
|
if(full_select_) { |
|
while(next.second >1) { |
|
next.second--; |
|
if(valid_offset(next.second)) |
|
break; |
|
} |
|
next.first = next.second; |
|
while(next.first >0) { |
|
next.first--; |
|
if(valid_offset(next.first)) |
|
break; |
|
} |
|
} |
|
else { |
|
while(next.second >1) { |
|
next.second--; |
|
if(valid_offset(next.second)) |
|
break; |
|
} |
|
next.first = next.second - 1; |
|
} |
|
update_current(next); |
|
} |
|
|
|
private: |
|
|
|
void set_end() |
|
{ |
|
current_.first = size() - 1; |
|
current_.second = size(); |
|
value_ = segment_type(map_->end(),map_->end(),0); |
|
} |
|
void set_begin() |
|
{ |
|
current_.first = current_.second = 0; |
|
value_ = segment_type(map_->begin(),map_->begin(),0); |
|
increment(); |
|
} |
|
|
|
void set(base_iterator p) |
|
{ |
|
size_t dist=std::distance(map_->begin(),p); |
|
index_type::const_iterator b=map_->index().begin(),e=map_->index().end(); |
|
index_type::const_iterator |
|
boundary_point=std::upper_bound(b,e,break_info(dist)); |
|
while(boundary_point != e && (boundary_point->rule & mask_)==0) |
|
boundary_point++; |
|
|
|
current_.first = current_.second = boundary_point - b; |
|
|
|
if(full_select_) { |
|
while(current_.first > 0) { |
|
current_.first --; |
|
if(valid_offset(current_.first)) |
|
break; |
|
} |
|
} |
|
else { |
|
if(current_.first > 0) |
|
current_.first --; |
|
} |
|
value_.first = map_->begin(); |
|
std::advance(value_.first,get_offset(current_.first)); |
|
value_.second = value_.first; |
|
std::advance(value_.second,get_offset(current_.second) - get_offset(current_.first)); |
|
|
|
update_rule(); |
|
} |
|
|
|
void update_current(std::pair<size_t,size_t> pos) |
|
{ |
|
std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first); |
|
std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second); |
|
std::advance(value_.first,first_diff); |
|
std::advance(value_.second,second_diff); |
|
current_ = pos; |
|
update_rule(); |
|
} |
|
|
|
void update_rule() |
|
{ |
|
if(current_.second != size()) { |
|
value_.rule(index()[current_.second].rule); |
|
} |
|
} |
|
size_t get_offset(size_t ind) const |
|
{ |
|
if(ind == size()) |
|
return index().back().offset; |
|
return index()[ind].offset; |
|
} |
|
|
|
bool valid_offset(size_t offset) const |
|
{ |
|
return offset == 0 |
|
|| offset == size() // make sure we not acess index[size] |
|
|| (index()[offset].rule & mask_)!=0; |
|
} |
|
|
|
size_t size() const |
|
{ |
|
return index().size(); |
|
} |
|
|
|
index_type const &index() const |
|
{ |
|
return map_->index(); |
|
} |
|
|
|
|
|
segment_type value_; |
|
std::pair<size_t,size_t> current_; |
|
mapping_type const *map_; |
|
rule_type mask_; |
|
bool full_select_; |
|
}; |
|
|
|
template<typename BaseIterator> |
|
class boundary_point_index_iterator : |
|
public boost::iterator_facade< |
|
boundary_point_index_iterator<BaseIterator>, |
|
boundary_point<BaseIterator>, |
|
boost::bidirectional_traversal_tag, |
|
boundary_point<BaseIterator> const & |
|
> |
|
{ |
|
public: |
|
typedef BaseIterator base_iterator; |
|
typedef mapping<base_iterator> mapping_type; |
|
typedef boundary_point<base_iterator> boundary_point_type; |
|
|
|
boundary_point_index_iterator() : current_(0),map_(0) |
|
{ |
|
} |
|
|
|
boundary_point_index_iterator(bool is_begin,mapping_type const *map,rule_type mask) : |
|
map_(map), |
|
mask_(mask) |
|
{ |
|
if(is_begin) |
|
set_begin(); |
|
else |
|
set_end(); |
|
} |
|
boundary_point_index_iterator(base_iterator p,mapping_type const *map,rule_type mask) : |
|
map_(map), |
|
mask_(mask) |
|
{ |
|
set(p); |
|
} |
|
|
|
boundary_point_type const &dereference() const |
|
{ |
|
return value_; |
|
} |
|
|
|
bool equal(boundary_point_index_iterator const &other) const |
|
{ |
|
return map_ == other.map_ && current_ == other.current_; |
|
} |
|
|
|
void increment() |
|
{ |
|
size_t next = current_; |
|
while(next < size()) { |
|
next++; |
|
if(valid_offset(next)) |
|
break; |
|
} |
|
update_current(next); |
|
} |
|
|
|
void decrement() |
|
{ |
|
size_t next = current_; |
|
while(next>0) { |
|
next--; |
|
if(valid_offset(next)) |
|
break; |
|
} |
|
update_current(next); |
|
} |
|
|
|
private: |
|
void set_end() |
|
{ |
|
current_ = size(); |
|
value_ = boundary_point_type(map_->end(),0); |
|
} |
|
void set_begin() |
|
{ |
|
current_ = 0; |
|
value_ = boundary_point_type(map_->begin(),0); |
|
} |
|
|
|
void set(base_iterator p) |
|
{ |
|
size_t dist = std::distance(map_->begin(),p); |
|
|
|
index_type::const_iterator b=index().begin(); |
|
index_type::const_iterator e=index().end(); |
|
index_type::const_iterator ptr = std::lower_bound(b,e,break_info(dist)); |
|
|
|
if(ptr==index().end()) |
|
current_=size()-1; |
|
else |
|
current_=ptr - index().begin(); |
|
|
|
while(!valid_offset(current_)) |
|
current_ ++; |
|
|
|
std::ptrdiff_t diff = get_offset(current_) - dist; |
|
std::advance(p,diff); |
|
value_.iterator(p); |
|
update_rule(); |
|
} |
|
|
|
void update_current(size_t pos) |
|
{ |
|
std::ptrdiff_t diff = get_offset(pos) - get_offset(current_); |
|
base_iterator i=value_.iterator(); |
|
std::advance(i,diff); |
|
current_ = pos; |
|
value_.iterator(i); |
|
update_rule(); |
|
} |
|
|
|
void update_rule() |
|
{ |
|
if(current_ != size()) { |
|
value_.rule(index()[current_].rule); |
|
} |
|
} |
|
size_t get_offset(size_t ind) const |
|
{ |
|
if(ind == size()) |
|
return index().back().offset; |
|
return index()[ind].offset; |
|
} |
|
|
|
bool valid_offset(size_t offset) const |
|
{ |
|
return offset == 0 |
|
|| offset + 1 >= size() // last and first are always valid regardless of mark |
|
|| (index()[offset].rule & mask_)!=0; |
|
} |
|
|
|
size_t size() const |
|
{ |
|
return index().size(); |
|
} |
|
|
|
index_type const &index() const |
|
{ |
|
return map_->index(); |
|
} |
|
|
|
|
|
boundary_point_type value_; |
|
size_t current_; |
|
mapping_type const *map_; |
|
rule_type mask_; |
|
}; |
|
|
|
|
|
} // details |
|
|
|
/// \endcond |
|
|
|
// Forward declarations: segment_index and boundary_point_index refer to each
// other in their converting constructors and assignment operators.
template<typename BaseIterator>
class segment_index;

template<typename BaseIterator>
class boundary_point_index;
|
|
|
|
|
/// |
|
/// \brief This class holds an index of segments in the text range and allows to iterate over them |
|
/// |
|
/// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
|
/// to the \ref segment objects. |
|
/// |
|
/// It provides two options on way of selecting segments: |
|
/// |
|
/// - \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to |
|
/// various masks %as \ref word_any. |
|
/// \n |
|
/// The default is to select any types of boundaries. |
|
/// \n |
|
/// For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators |
|
/// would iterate only over the words containing Kana letters and \ref word_any would select all types of |
|
/// words excluding ranges that consist of white space and punctuation marks. So iterating over the text |
|
/// "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be", instead |
|
/// of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?". |
|
/// - \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous |
|
/// %boundary point does not fit the selected rule. |
|
/// \n |
|
/// For example: We want to fetch all sentences from the following text: "Hello! How\nare you?". |
|
/// \n |
|
/// This text contains three %boundary points separating it to sentences by different rules: |
|
/// - The exclamation mark "!" ends the sentence "Hello!" |
|
/// - The line feed that splits the sentence "How\nare you?" into two parts. |
|
/// - The question mark that ends the second sentence. |
|
/// \n |
|
/// If you would only change the \ref rule() to \ref sentence_term then the segment_index would |
|
/// provide two sentences "Hello!" and "are you?" %as only them actually terminated with required |
|
/// terminator "!" or "?". But changing \ref full_select() to true, the selected segment would include |
|
/// all the text up to previous valid %boundary point and would return two expected sentences: |
|
/// "Hello!" and "How\nare you?". |
|
/// |
|
/// This class allows to find a segment according to the given iterator in range using \ref find() member |
|
/// function. |
|
/// |
|
/// \note |
|
/// |
|
/// - Changing any of the options - \ref rule() or \ref full_select() and of course re-indexing the text |
|
/// invalidates existing iterators and they can't be used any more. |
|
/// - segment_index can be created from boundary_point_index or other segment_index that was created with |
|
/// same \ref boundary_type. This is very fast operation %as they shared same index |
|
/// and it does not require its regeneration. |
|
/// |
|
/// \see |
|
/// |
|
/// - \ref boundary_point_index |
|
/// - \ref segment |
|
/// - \ref boundary_point |
|
/// |
|
|
|
template<typename BaseIterator> |
|
class segment_index { |
|
public: |
|
|
|
/// |
|
/// The type of the iterator used to iterate over the original text |
|
/// |
|
typedef BaseIterator base_iterator; |
|
#ifdef BOOST_LOCALE_DOXYGEN |
|
/// |
|
/// The bidirectional iterator that iterates over \ref value_type objects. |
|
/// |
|
/// - The iterators may be invalidated by use of any non-const member function |
|
/// including but not limited to \ref rule(rule_type) and \ref full_select(bool). |
|
/// - The returned value_type object is valid %as long %as iterator points to it. |
|
/// So this following code is wrong %as t used after p was updated: |
|
/// \code |
|
/// segment_index<some_iterator>::iterator p=index.begin(); |
|
/// segment<some_iterator> &t = *p; |
|
/// ++p; |
|
/// cout << t.str() << endl; |
|
/// \endcode |
|
/// |
|
typedef unspecified_iterator_type iterator; |
|
/// |
|
/// \copydoc iterator |
|
/// |
|
typedef unspecified_iterator_type const_iterator; |
|
#else |
|
typedef details::segment_index_iterator<base_iterator> iterator; |
|
typedef details::segment_index_iterator<base_iterator> const_iterator; |
|
#endif |
|
/// |
|
/// The type dereferenced by the \ref iterator and \ref const_iterator. It is |
|
/// an object that represents selected segment. |
|
/// |
|
typedef segment<base_iterator> value_type; |
|
|
|
/// |
|
/// Default constructor. |
|
/// |
|
/// \note |
|
/// |
|
/// When this object is constructed by default it does not include a valid index, thus |
|
/// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined |
|
/// behavior |
|
/// |
|
segment_index() : mask_(0xFFFFFFFFu),full_select_(false) |
|
{ |
|
} |
|
/// |
|
/// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text |
|
/// in range [begin,end) using a rule \a mask for locale \a loc. |
|
/// |
|
segment_index(boundary_type type, |
|
base_iterator begin, |
|
base_iterator end, |
|
rule_type mask, |
|
std::locale const &loc=std::locale()) |
|
: |
|
map_(type,begin,end,loc), |
|
mask_(mask), |
|
full_select_(false) |
|
{ |
|
} |
|
/// |
|
/// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text |
|
/// in range [begin,end) selecting all possible segments (full mask) for locale \a loc. |
|
/// |
|
segment_index(boundary_type type, |
|
base_iterator begin, |
|
base_iterator end, |
|
std::locale const &loc=std::locale()) |
|
: |
|
map_(type,begin,end,loc), |
|
mask_(0xFFFFFFFFu), |
|
full_select_(false) |
|
{ |
|
} |
|
|
|
/// |
|
/// Create a segment_index from a \ref boundary_point_index. It copies all indexing information |
|
/// and used default rule (all possible segments) |
|
/// |
|
/// This operation is very cheap, so if you use boundary_point_index and segment_index on same text |
|
/// range it is much better to create one from another rather then indexing the same |
|
/// range twice. |
|
/// |
|
/// \note \ref rule() flags are not copied |
|
/// |
|
segment_index(boundary_point_index<base_iterator> const &); |
|
/// |
|
/// Copy an index from a \ref boundary_point_index. It copies all indexing information |
|
/// and uses the default rule (all possible segments) |
|
/// |
|
/// This operation is very cheap, so if you use boundary_point_index and segment_index on same text |
|
/// range it is much better to create one from another rather then indexing the same |
|
/// range twice. |
|
/// |
|
/// \note \ref rule() flags are not copied |
|
/// |
|
segment_index const &operator = (boundary_point_index<base_iterator> const &); |
|
|
|
|
|
/// |
|
/// Create a new index for %boundary analysis \ref boundary_type "type" of the text |
|
/// in range [begin,end) for locale \a loc. |
|
/// |
|
/// \note \ref rule() and \ref full_select() remain unchanged. |
|
/// |
|
void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale()) |
|
{ |
|
map_ = mapping_type(type,begin,end,loc); |
|
} |
|
|
|
/// |
|
/// Get the \ref iterator on the beginning of the segments range. |
|
/// |
|
/// Preconditions: the segment_index should have a mapping |
|
/// |
|
/// \note |
|
/// |
|
/// The returned iterator is invalidated by access to any non-const member functions of this object |
|
/// |
|
iterator begin() const |
|
{ |
|
return iterator(true,&map_,mask_,full_select_); |
|
} |
|
|
|
/// |
|
/// Get the \ref iterator on the ending of the segments range. |
|
/// |
|
/// Preconditions: the segment_index should have a mapping |
|
/// |
|
/// The returned iterator is invalidated by access to any non-const member functions of this object |
|
/// |
|
iterator end() const |
|
{ |
|
return iterator(false,&map_,mask_,full_select_); |
|
} |
|
|
|
/// |
|
/// Find a first valid segment following a position \a p. |
|
/// |
|
/// If \a p is inside a valid segment this segment is selected: |
|
/// |
|
/// For example: For \ref word %boundary analysis with \ref word_any rule(): |
|
/// |
|
/// - "to| be or ", would point to "be", |
|
/// - "t|o be or ", would point to "to", |
|
/// - "to be or| ", would point to end. |
|
/// |
|
/// |
|
/// Preconditions: the segment_index should have a mapping and \a p should be valid iterator |
|
/// to the text in the mapped range. |
|
/// |
|
/// The returned iterator is invalidated by access to any non-const member functions of this object |
|
/// |
|
iterator find(base_iterator p) const |
|
{ |
|
return iterator(p,&map_,mask_,full_select_); |
|
} |
|
|
|
/// |
|
/// Get the mask of rules that are used |
|
/// |
|
rule_type rule() const |
|
{ |
|
return mask_; |
|
} |
|
/// |
|
/// Set the mask of rules that are used |
|
/// |
|
void rule(rule_type v) |
|
{ |
|
mask_ = v; |
|
} |
|
|
|
/// |
|
/// Get the full_select property value - should segment include in the range |
|
/// values that not belong to specific \ref rule() or not. |
|
/// |
|
/// The default value is false. |
|
/// |
|
/// For example for \ref sentence %boundary with rule \ref sentence_term the segments |
|
/// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false |
|
/// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select() |
|
/// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the |
|
/// following part "are you?" |
|
/// |
|
|
|
bool full_select() const |
|
{ |
|
return full_select_; |
|
} |
|
|
|
/// |
|
/// Set the full_select property value - should segment include in the range |
|
/// values that not belong to specific \ref rule() or not. |
|
/// |
|
/// The default value is false. |
|
/// |
|
/// For example for \ref sentence %boundary with rule \ref sentence_term the segments |
|
/// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false |
|
/// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select() |
|
/// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the |
|
/// following part "are you?" |
|
/// |
|
|
|
void full_select(bool v) |
|
{ |
|
full_select_ = v; |
|
} |
|
|
|
private: |
|
friend class boundary_point_index<base_iterator>; |
|
typedef details::mapping<base_iterator> mapping_type; |
|
mapping_type map_; |
|
rule_type mask_; |
|
bool full_select_; |
|
}; |
|
|
|
/// |
|
/// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating |
|
/// over them. |
|
/// |
|
/// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
|
/// to the \ref boundary_point objects. |
|
/// |
|
/// It provides an option that affects selecting %boundary points according to different rules: |
|
/// using \ref rule(rule_type mask) member function. It allows to set a mask that select only specific |
|
/// types of %boundary points like \ref sentence_term. |
|
/// |
|
/// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default |
|
/// rule is used the %boundary points would be: |
|
/// |
|
/// - "|Hello! How\nare you?" |
|
/// - "Hello! |How\nare you?" |
|
/// - "Hello! How\n|are you?" |
|
/// - "Hello! How\nare you?|" |
|
/// |
|
/// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be: |
|
/// |
|
/// - "|Hello! How\nare you?" |
|
/// - "Hello! |How\nare you?" |
|
/// - "Hello! How\nare you?|" |
|
/// |
|
/// Such that a %boundary point defined by a line feed character would be ignored. |
|
/// |
|
/// This class allows to find a boundary_point according to the given iterator in range using \ref find() member |
|
/// function. |
|
/// |
|
/// \note |
|
/// - Even an empty text range [x,x) considered to have a one %boundary point x. |
|
/// - \a a and \a b points of the range [a,b) are always considered %boundary points |
|
/// regardless the rules used. |
|
/// - Changing the \ref rule() option and of course re-indexing the text
|
/// invalidates existing iterators and they can't be used any more. |
|
/// - boundary_point_index can be created from segment_index or other boundary_point_index that was created with |
|
/// same \ref boundary_type. This is very fast operation %as they shared same index |
|
/// and it does not require its regeneration. |
|
/// |
|
/// \see |
|
/// |
|
/// - \ref segment_index |
|
/// - \ref boundary_point |
|
/// - \ref segment |
|
/// |
|
|
|
|
|
template<typename BaseIterator> |
|
class boundary_point_index { |
|
public: |
|
/// |
|
/// The type of the iterator used to iterate over the original text |
|
/// |
|
typedef BaseIterator base_iterator; |
|
#ifdef BOOST_LOCALE_DOXYGEN |
|
/// |
|
/// The bidirectional iterator that iterates over \ref value_type objects. |
|
/// |
|
/// - The iterators may be invalidated by use of any non-const member function |
|
/// including but not limited to \ref rule(rule_type) member function. |
|
/// - The returned value_type object is valid %as long %as iterator points to it. |
|
/// So this following code is wrong %as t used after p was updated: |
|
/// \code |
|
/// boundary_point_index<some_iterator>::iterator p=index.begin(); |
|
/// boundary_point<some_iterator> &t = *p; |
|
/// ++p; |
|
/// rule_type r = t->rule(); |
|
/// \endcode |
|
/// |
|
typedef unspecified_iterator_type iterator; |
|
/// |
|
/// \copydoc iterator |
|
/// |
|
typedef unspecified_iterator_type const_iterator; |
|
#else |
|
typedef details::boundary_point_index_iterator<base_iterator> iterator; |
|
typedef details::boundary_point_index_iterator<base_iterator> const_iterator; |
|
#endif |
|
/// |
|
/// The type dereferenced by the \ref iterator and \ref const_iterator. It is |
|
/// an object that represents the selected \ref boundary_point "boundary point". |
|
/// |
|
typedef boundary_point<base_iterator> value_type; |
|
|
|
/// |
|
/// Default constructor. |
|
/// |
|
/// \note |
|
/// |
|
/// When this object is constructed by default it does not include a valid index, thus |
|
/// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined |
|
/// behavior |
|
/// |
|
boundary_point_index() : mask_(0xFFFFFFFFu) |
|
{ |
|
} |
|
|
|
/// |
|
/// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text |
|
/// in range [begin,end) using a rule \a mask for locale \a loc. |
|
/// |
|
boundary_point_index(boundary_type type, |
|
base_iterator begin, |
|
base_iterator end, |
|
rule_type mask, |
|
std::locale const &loc=std::locale()) |
|
: |
|
map_(type,begin,end,loc), |
|
mask_(mask) |
|
{ |
|
} |
|
/// |
|
/// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text |
|
/// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc. |
|
/// |
|
boundary_point_index(boundary_type type, |
|
base_iterator begin, |
|
base_iterator end, |
|
std::locale const &loc=std::locale()) |
|
: |
|
map_(type,begin,end,loc), |
|
mask_(0xFFFFFFFFu) |
|
{ |
|
} |
|
|
|
/// |
|
/// Create a boundary_point_index from a \ref segment_index. It copies all indexing information |
|
/// and uses the default rule (all possible %boundary points) |
|
/// |
|
/// This operation is very cheap, so if you use boundary_point_index and segment_index on same text |
|
/// range it is much better to create one from another rather then indexing the same |
|
/// range twice. |
|
/// |
|
/// \note \ref rule() flags are not copied |
|
/// |
|
boundary_point_index(segment_index<base_iterator> const &other); |
|
/// |
|
/// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information |
|
/// and keeps the current \ref rule() unchanged |
|
/// |
|
/// This operation is very cheap, so if you use boundary_point_index and segment_index on same text |
|
/// range it is much better to create one from another rather then indexing the same |
|
/// range twice. |
|
/// |
|
/// \note \ref rule() flags are not copied |
|
/// |
|
boundary_point_index const &operator=(segment_index<base_iterator> const &other); |
|
|
|
/// |
|
/// Create a new index for %boundary analysis \ref boundary_type "type" of the text |
|
/// in range [begin,end) for locale \a loc. |
|
/// |
|
/// \note \ref rule() remains unchanged. |
|
/// |
|
void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale()) |
|
{ |
|
map_ = mapping_type(type,begin,end,loc); |
|
} |
|
|
|
/// |
|
/// Get the \ref iterator on the beginning of the %boundary points range. |
|
/// |
|
/// Preconditions: this boundary_point_index should have a mapping |
|
/// |
|
/// \note |
|
/// |
|
/// The returned iterator is invalidated by access to any non-const member functions of this object |
|
/// |
|
iterator begin() const |
|
{ |
|
return iterator(true,&map_,mask_); |
|
} |
|
|
|
/// |
|
/// Get the \ref iterator on the ending of the %boundary points range. |
|
/// |
|
/// Preconditions: this boundary_point_index should have a mapping |
|
/// |
|
/// \note |
|
/// |
|
/// The returned iterator is invalidated by access to any non-const member functions of this object |
|
/// |
|
iterator end() const |
|
{ |
|
return iterator(false,&map_,mask_); |
|
} |
|
|
|
/// |
|
/// Find a first valid %boundary point on a position \a p or following it. |
|
/// |
|
/// For example: For \ref word %boundary analysis of the text "to be or" |
|
/// |
|
/// - "|to be", would return %boundary point at "|to be", |
|
/// - "t|o be", would point to "to| be" |
|
/// |
|
/// Preconditions: the boundary_point_index should have a mapping and \a p should be valid iterator |
|
/// to the text in the mapped range. |
|
/// |
|
/// The returned iterator is invalidated by access to any non-const member functions of this object |
|
/// |
|
iterator find(base_iterator p) const |
|
{ |
|
return iterator(p,&map_,mask_); |
|
} |
|
|
|
/// |
|
/// Get the mask of rules that are used |
|
/// |
|
rule_type rule() const |
|
{ |
|
return mask_; |
|
} |
|
/// |
|
/// Set the mask of rules that are used |
|
/// |
|
void rule(rule_type v) |
|
{ |
|
mask_ = v; |
|
} |
|
|
|
private: |
|
|
|
friend class segment_index<base_iterator>; |
|
typedef details::mapping<base_iterator> mapping_type; |
|
mapping_type map_; |
|
rule_type mask_; |
|
}; |
|
|
|
/// \cond INTERNAL |
|
template<typename BaseIterator> |
|
segment_index<BaseIterator>::segment_index(boundary_point_index<BaseIterator> const &other) : |
|
map_(other.map_), |
|
mask_(0xFFFFFFFFu), |
|
full_select_(false) |
|
{ |
|
} |
|
|
|
template<typename BaseIterator> |
|
boundary_point_index<BaseIterator>::boundary_point_index(segment_index<BaseIterator> const &other) : |
|
map_(other.map_), |
|
mask_(0xFFFFFFFFu) |
|
{ |
|
} |
|
|
|
template<typename BaseIterator> |
|
segment_index<BaseIterator> const &segment_index<BaseIterator>::operator=(boundary_point_index<BaseIterator> const &other) |
|
{ |
|
map_ = other.map_; |
|
return *this; |
|
} |
|
|
|
template<typename BaseIterator> |
|
boundary_point_index<BaseIterator> const &boundary_point_index<BaseIterator>::operator=(segment_index<BaseIterator> const &other) |
|
{ |
|
map_ = other.map_; |
|
return *this; |
|
} |
|
/// \endcond |
|
|
|
typedef segment_index<std::string::const_iterator> ssegment_index; ///< convenience typedef |
|
typedef segment_index<std::wstring::const_iterator> wssegment_index; ///< convenience typedef |
|
#ifdef BOOST_HAS_CHAR16_T |
|
typedef segment_index<std::u16string::const_iterator> u16ssegment_index;///< convenience typedef |
|
#endif |
|
#ifdef BOOST_HAS_CHAR32_T |
|
typedef segment_index<std::u32string::const_iterator> u32ssegment_index;///< convenience typedef |
|
#endif |
|
|
|
typedef segment_index<char const *> csegment_index; ///< convenience typedef |
|
typedef segment_index<wchar_t const *> wcsegment_index; ///< convenience typedef |
|
#ifdef BOOST_HAS_CHAR16_T |
|
typedef segment_index<char16_t const *> u16csegment_index; ///< convenience typedef |
|
#endif |
|
#ifdef BOOST_HAS_CHAR32_T |
|
typedef segment_index<char32_t const *> u32csegment_index; ///< convenience typedef |
|
#endif |
|
|
|
typedef boundary_point_index<std::string::const_iterator> sboundary_point_index;///< convenience typedef |
|
typedef boundary_point_index<std::wstring::const_iterator> wsboundary_point_index;///< convenience typedef |
|
#ifdef BOOST_HAS_CHAR16_T |
|
typedef boundary_point_index<std::u16string::const_iterator> u16sboundary_point_index;///< convenience typedef |
|
#endif |
|
#ifdef BOOST_HAS_CHAR32_T |
|
typedef boundary_point_index<std::u32string::const_iterator> u32sboundary_point_index;///< convenience typedef |
|
#endif |
|
|
|
typedef boundary_point_index<char const *> cboundary_point_index; ///< convenience typedef |
|
typedef boundary_point_index<wchar_t const *> wcboundary_point_index; ///< convenience typedef |
|
#ifdef BOOST_HAS_CHAR16_T |
|
typedef boundary_point_index<char16_t const *> u16cboundary_point_index;///< convenience typedef |
|
#endif |
|
#ifdef BOOST_HAS_CHAR32_T |
|
typedef boundary_point_index<char32_t const *> u32cboundary_point_index;///< convenience typedef |
|
#endif |
|
|
|
|
|
|
|
} // boundary |
|
|
|
} // locale |
|
} // boost |
|
|
|
/// |
|
/// \example boundary.cpp |
|
/// Example of using segment_index |
|
/// \example wboundary.cpp |
|
/// Example of using segment_index over wide strings |
|
/// |
|
|
|
#ifdef BOOST_MSVC |
|
#pragma warning(pop) |
|
#endif |
|
|
|
#endif |
|
// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
|
|
|