diff --git a/doc/bad_expression.html b/doc/bad_expression.html new file mode 100644 index 00000000..aee75368 --- /dev/null +++ b/doc/bad_expression.html @@ -0,0 +1,81 @@ + + +
+
+ |
+
+ Boost.Regex+class regex_error+ |
+
+ |
+
#include <boost/pattern_except.hpp>
+The class regex_error
defines the type of objects thrown as
+ exceptions to report errors during the conversion from a string representing a
+ regular expression to a finite state machine.
+namespace boost{ + +class regex_error : public std::runtime_error +{ +public: + explicit regex_error(const std::string& s, regex_constants::error_type err, std::ptrdiff_t pos); + explicit regex_error(boost::regex_constants::error_type err); + boost::regex_constants::error_type code()const; + std::ptrdiff_t position()const; +}; + +typedef regex_error bad_pattern; // for backwards compatibility +typedef regex_error bad_expression; // for backwards compatibility + +} // namespace boost ++
+regex_error(const std::string& s, regex_constants::error_type err, std::ptrdiff_t pos); +regex_error(boost::regex_constants::error_type err);+
Effects: Constructs an object of class regex_error
.
+boost::regex_constants::error_type code()const;+
Effects: returns the error code that represents the parsing error that occurred.
++std::ptrdiff_t position()const;+
Effects: returns the location in the expression where parsing stopped.
+Footnotes: the choice of std::runtime_error
as the base class for
+ regex_error
is moot; depending upon how the library is used
+ exceptions may be either logic errors (programmer supplied expressions) or run
+ time errors (user supplied expressions). The library previously used bad_pattern
+ and bad_expression
for errors; these have been replaced by the
+ single class regex_error
to keep the library in synchronization
+ with the standardization proposal.
Revised + + 24 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + diff --git a/doc/basic_regex.html b/doc/basic_regex.html new file mode 100644 index 00000000..16774739 --- /dev/null +++ b/doc/basic_regex.html @@ -0,0 +1,906 @@ + + + +
+ |
+
+ Boost.Regex+basic_regex+ |
+
+ |
+
+#include <boost/regex.hpp> ++
The template class basic_regex encapsulates regular expression parsing + and compilation. The class takes two template parameters:
+charT: determines the character type, i.e. either char or + wchar_t; see charT concept.
+traits: determines the behavior of the character type, for + example which character class names are recognized. A default traits class is + provided: regex_traits<charT>. See + also traits concept.
+For ease of use there are two typedefs that define the two standard basic_regex + instances, unless you want to use custom traits classes or non-standard + character types, you won't need to use anything other than these:
++namespace boost{ +template <class charT, class traits = regex_traits<charT> > +class basic_regex; +typedef basic_regex<char> regex; +typedef basic_regex<wchar_t> wregex; +} ++
The definition of basic_regex follows: it is based very closely on class + basic_string, and fulfils the requirements for a constant-container of charT.
++namespace boost{ + +template <class charT, class traits = regex_traits<charT> > +class basic_regex { + public: + // types: + typedef charT value_type; + typedef implementation-specific const_iterator; + typedef const_iterator iterator; + typedef charT& reference; + typedef const charT& const_reference; + typedef std::ptrdiff_t difference_type; + typedef std::size_t size_type; + typedef regex_constants::syntax_option_type flag_type; + typedef typename traits::locale_type locale_type; + + // constants: + // main option selection: + static const regex_constants::syntax_option_type normal = regex_constants::normal; + static const regex_constants::syntax_option_type ECMAScript = normal; + static const regex_constants::syntax_option_type JavaScript = normal; + static const regex_constants::syntax_option_type JScript = normal; + static const regex_constants::syntax_option_type basic = regex_constants::basic; + static const regex_constants::syntax_option_type extended = regex_constants::extended; + static const regex_constants::syntax_option_type awk = regex_constants::awk; + static const regex_constants::syntax_option_type grep = regex_constants::grep; + static const regex_constants::syntax_option_type egrep = regex_constants::egrep; + static const regex_constants::syntax_option_type sed = regex_constants::sed; + static const regex_constants::syntax_option_type perl = regex_constants::perl; + static const regex_constants::syntax_option_type literal = regex_constants::literal; + // modifiers specific to perl expressions: + static const regex_constants::syntax_option_type no_mod_m = regex_constants::no_mod_m; + static const regex_constants::syntax_option_type no_mod_s = regex_constants::no_mod_s; + static const regex_constants::syntax_option_type mod_s = regex_constants::mod_s; + static const regex_constants::syntax_option_type mod_x = regex_constants::mod_x; + // modifiers specific to POSIX basic expressions: + static const regex_constants::syntax_option_type 
bk_plus_qm = regex_constants::bk_plus_qm; + static const regex_constants::syntax_option_type bk_vbar = regex_constants::bk_vbar; + static const regex_constants::syntax_option_type no_char_classes = regex_constants::no_char_classes; + static const regex_constants::syntax_option_type no_intervals = regex_constants::no_intervals; + // common modifiers: + static const regex_constants::syntax_option_type nosubs = regex_constants::nosubs; + static const regex_constants::syntax_option_type optimize = regex_constants::optimize; + static const regex_constants::syntax_option_type collate = regex_constants::collate; + static const regex_constants::syntax_option_type newline_alt = regex_constants::newline_alt; + static const regex_constants::syntax_option_type no_except = regex_constants::no_except; + + // construct/copy/destroy: + explicit basic_regex (); + explicit basic_regex(const charT* p, flag_type f = regex_constants::normal); + basic_regex(const charT* p1, const charT* p2, flag_type f = regex_constants::normal); + basic_regex(const charT* p, size_type len, flag_type f); + basic_regex(const basic_regex&); + template <class ST, class SA> + explicit basic_regex(const basic_string<charT, ST, SA>& p, flag_type f = regex_constants::normal); + template <class InputIterator> + basic_regex(InputIterator first, InputIterator last, flag_type f = regex_constants::normal); + + ~basic_regex(); + basic_regex& operator=(const basic_regex&); + basic_regex& operator= (const charT* ptr); + template <class ST, class SA> + basic_regex& operator= (const basic_string<charT, ST, SA>& p); + // iterators: + const_iterator begin() const; + const_iterator end() const; + // capacity: + size_type size() const; + size_type max_size() const; + bool empty() const; + unsigned mark_count()const; + // + // modifiers: + basic_regex& assign(const basic_regex& that); + basic_regex& assign(const charT* ptr, flag_type f = regex_constants::normal); + basic_regex& assign(const charT* ptr, unsigned int len, 
flag_type f); + template <class string_traits, class A> + basic_regex& assign(const basic_string<charT, string_traits, A>& s, + flag_type f = regex_constants::normal); + template <class InputIterator> + basic_regex& assign(InputIterator first, InputIterator last, + flag_type f = regex_constants::normal); + + // const operations: + flag_type flags() const; + int status()const; + basic_string<charT> str() const; + int compare(basic_regex&) const; + // locale: + locale_type imbue(locale_type loc); + locale_type getloc() const; + // swap + void swap(basic_regex&) throw(); +}; + +template <class charT, class traits> +bool operator == (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); +template <class charT, class traits> +bool operator != (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); +template <class charT, class traits> +bool operator < (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); +template <class charT, class traits> +bool operator <= (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); +template <class charT, class traits> +bool operator >= (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); +template <class charT, class traits> +bool operator > (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); + +template <class charT, class io_traits, class re_traits> +basic_ostream<charT, io_traits>& + operator << (basic_ostream<charT, io_traits>& os, + const basic_regex<charT, re_traits>& e); + +template <class charT, class traits> +void swap(basic_regex<charT, traits>& e1, + basic_regex<charT, traits>& e2); + +typedef basic_regex<char> regex; +typedef basic_regex<wchar_t> wregex; + +} // namespace boost ++
Class basic_regex has the following public member functions:
++// main option selection: +static const regex_constants::syntax_option_type normal = regex_constants::normal; +static const regex_constants::syntax_option_type ECMAScript = normal; +static const regex_constants::syntax_option_type JavaScript = normal; +static const regex_constants::syntax_option_type JScript = normal; +static const regex_constants::syntax_option_type basic = regex_constants::basic; +static const regex_constants::syntax_option_type extended = regex_constants::extended; +static const regex_constants::syntax_option_type awk = regex_constants::awk; +static const regex_constants::syntax_option_type grep = regex_constants::grep; +static const regex_constants::syntax_option_type egrep = regex_constants::egrep; +static const regex_constants::syntax_option_type sed = regex_constants::sed; +static const regex_constants::syntax_option_type perl = regex_constants::perl; +static const regex_constants::syntax_option_type literal = regex_constants::literal; +// modifiers specific to perl expressions: +static const regex_constants::syntax_option_type no_mod_m = regex_constants::no_mod_m; +static const regex_constants::syntax_option_type no_mod_s = regex_constants::no_mod_s; +static const regex_constants::syntax_option_type mod_s = regex_constants::mod_s; +static const regex_constants::syntax_option_type mod_x = regex_constants::mod_x; +// modifiers specific to POSIX basic expressions: +static const regex_constants::syntax_option_type bk_plus_qm = regex_constants::bk_plus_qm; +static const regex_constants::syntax_option_type bk_vbar = regex_constants::bk_vbar; +static const regex_constants::syntax_option_type no_char_classes = regex_constants::no_char_classes; +static const regex_constants::syntax_option_type no_intervals = regex_constants::no_intervals; +// common modifiers: +static const regex_constants::syntax_option_type nosubs = regex_constants::nosubs; +static const regex_constants::syntax_option_type optimize = regex_constants::optimize; +static const 
regex_constants::syntax_option_type collate = regex_constants::collate; +static const regex_constants::syntax_option_type newline_alt = regex_constants::newline_alt; ++
The static constant members are provided as synonyms for the constants declared
+ in namespace boost::regex_constants
; for each constant of type
+ syntax_option_type
declared in namespace boost::regex_constants
+ then a constant with the same name, type and value is declared within the scope
+ of basic_regex
.
basic_regex(); ++
Effects: Constructs an object of class basic_regex
. The
+ postconditions of this function are indicated in the table:
+ Element + |
+
+ Value + |
+
+ empty() + |
+
+ true + |
+
+ size() + |
+
+ 0 + |
+
+ str() + |
+
+ basic_string<charT>() + |
+
+
basic_regex(const charT* p, flag_type f = regex_constants::normal); + +
Requires: p shall not be a null pointer.
+Throws: bad_expression
if p is not a valid regular
+ expression, unless the flag no_except is set in f.
Effects: Constructs an object of class basic_regex
; the
+ object's internal finite state machine is constructed from the regular
+ expression contained in the null-terminated string p, and interpreted
+ according to the option flags specified
+ in f. The postconditions of this function are indicated in the table:
+ Element + |
+
+ Value + |
+
+ empty() + |
+
+ false + |
+
+ size() + |
+
+ char_traits<charT>::length(p) + |
+
+ str() + |
+
+ basic_string<charT>(p) + |
+
+ flags() + |
+
+ f + |
+
+ mark_count() + |
+
+ The number of marked sub-expressions within the expression. + |
+
+ ++
basic_regex(const charT* p1, const charT* p2, flag_type f = regex_constants::normal);+
Requires: p1 and p2 are not null pointers, p1 < p2
.
Throws: bad_expression
if [p1,p2) is not a valid regular
+ expression, unless the flag no_except is set in f.
Effects: Constructs an object of class basic_regex
; the
+ object's internal finite state machine is constructed from the regular
+ expression contained in the sequence of characters [p1,p2), and interpreted
+ according to the option flags specified in f.
+ The postconditions of this function are indicated in the table:
+ Element + |
+
+ Value + |
+
+ empty() + |
+
+ false + |
+
+ size() + |
+
+ std::distance(p1,p2) + |
+
+ str() + |
+
+ basic_string<charT>(p1,p2) + |
+
+ flags() + |
+
+ f + |
+
+ mark_count() + |
+
+ The number of marked sub-expressions within the expression. + |
+
+ ++
basic_regex(const charT* p, size_type len, flag_type f); ++
Requires: p shall not be a null pointer, len < max_size()
.
Throws: bad_expression
if p is not a valid regular
+ expression, unless the flag no_except is set in f.
Effects: Constructs an object of class basic_regex
; the
+ object's internal finite state machine is constructed from the regular
+ expression contained in the sequence of characters [p, p+len), and interpreted
+ according to the option flags specified in f.
+ The postconditions of this function are indicated in the table:
+ Element + |
+
+ Value + |
+
+ empty() + |
+
+ false + |
+
+ size() + |
+
+ len + |
+
+ str() + |
+
+ basic_string<charT>(p, len) + |
+
+ flags() + |
+
+ f + |
+
+ mark_count() + |
+
+ The number of marked sub-expressions within the expression. + |
+
+
basic_regex(const basic_regex& e); +
Effects: Constructs an object of class basic_regex
as a
+ copy of the object e. The postconditions of this function are indicated
+ in the table:
+ Element + |
+
+ Value + |
+
+ empty() + |
+
+ e.empty() + |
+
+ size() + |
+
+ e.size() + |
+
+ str() + |
+
+ e.str() + |
+
+ flags() + |
+
+ e.flags() + |
+
+ mark_count() + |
+
+ e.mark_count() + |
+
+
+template <class ST, class SA> +basic_regex(const basic_string<charT, ST, SA>& s, flag_type f = regex_constants::normal); +
Throws: bad_expression
if s is not a valid regular
+ expression, unless the flag no_except is set in f.
Effects: Constructs an object of class basic_regex
; the
+ object's internal finite state machine is constructed from the regular
+ expression contained in the string s, and interpreted according to the
+ option flags specified in f. The postconditions of this function
+ are indicated in the table:
+ Element + |
+
+ Value + |
+
+ empty() + |
+
+ false + |
+
+ size() + |
+
+ s.size() + |
+
+ str() + |
+
+ s + |
+
+ flags() + |
+
+ f + |
+
+ mark_count() + |
+
+ The number of marked sub-expressions within the expression. + |
+
+
+template <class ForwardIterator> +basic_regex(ForwardIterator first, ForwardIterator last, flag_type f = regex_constants::normal); +
Throws: bad_expression
if the sequence [first, last)
+ is not a valid regular expression, unless the flag no_except is set in f.
Effects: Constructs an object of class basic_regex
; the
+ object's internal finite state machine is constructed from the regular
+ expression contained in the sequence of characters [first, last), and
+ interpreted according to the option flags
+ specified in f. The postconditions of this function are indicated in the
+ table:
+ Element + |
+
+ Value + |
+
+ empty() + |
+
+ false + |
+
+ size() + |
+
+ distance(first,last) + |
+
+ str() + |
+
+ basic_string<charT>(first,last) + |
+
+ flags() + |
+
+ f + |
+
+ mark_count() + |
+
+ The number of marked sub-expressions within the expression. + |
+
+basic_regex& operator=(const basic_regex& e); ++
Effects: Returns the result of assign(e.str(), e.flags())
.
basic_regex& operator=(const charT* ptr); ++
+Requires: ptr shall not be a null pointer.
+Effects: Returns the result of assign(ptr)
.
+template <class ST, class SA> +basic_regex& operator=(const basic_string<charT, ST, SA>& p); ++
Effects: Returns the result of assign(p)
.
+const_iterator begin() const; ++
Effects: Returns a starting iterator to a sequence of characters + representing the regular expression.
++const_iterator end() const; ++
Effects: Returns a terminating iterator to a sequence of characters + representing the regular expression.
++size_type size() const; ++
Effects: Returns the length of the sequence of characters representing + the regular expression.
++size_type max_size() const; ++
Effects: Returns the maximum length of the sequence of characters + representing the regular expression.
++bool empty() const; ++
Effects: Returns true if the object does not contain a valid + regular expression, otherwise false.
+unsigned mark_count() const; ++
Effects: Returns the number of marked sub-expressions within the regular + expression.
++basic_regex& assign(const basic_regex& that); ++
Effects: Returns assign(that.str(), that.flags())
.
+basic_regex& assign(const charT* ptr, flag_type f = regex_constants::normal); ++
Effects: Returns assign(string_type(ptr), f)
.
basic_regex& assign(const charT* ptr, unsigned int len, flag_type f);+
Effects: Returns assign(string_type(ptr, len), f)
.
template <class string_traits, class A> +basic_regex& assign(const basic_string<charT, string_traits, A>& s, + flag_type f = regex_constants::normal); ++
Throws: bad_expression
if s is not a valid regular
+ expression, unless the flag no_except is set in f.
Returns: *this
.
Effects: Assigns the regular expression contained in the string s, + interpreted according to the option flags specified + in f. The postconditions of this function are indicated in the table:
+
+ Element + |
+
+ Value + |
+
+ empty() + |
+
+ false + |
+
+ size() + |
+
+ s.size() + |
+
+ str() + |
+
+ s + |
+
+ flags() + |
+
+ f + |
+
+ mark_count() + |
+
+ The number of marked sub-expressions within the expression. + |
+
+ ++
template <class InputIterator> +basic_regex& assign(InputIterator first, InputIterator last, + flag_type f = regex_constants::normal); ++
Requires: The type InputIterator corresponds to the Input Iterator + requirements (24.1.1).
+Effects: Returns assign(string_type(first, last), f)
.
flag_type flags() const; ++
Effects: Returns a copy of the regular expression syntax flags that were
+ passed to the object's constructor, or the last call to assign.
+ int status() const;
+
+ Effects: Returns zero if the expression contains a valid + regular expression, otherwise an error code. + This member function is retained for use in environments that cannot use + exception handling.
+basic_string<charT> str() const; ++
Effects: Returns a copy of the character sequence passed to the object's
+ constructor, or the last call to assign.
int compare(basic_regex& e)const; ++
Effects: If flags() == e.flags()
then returns str().compare(e.str())
,
+ otherwise returns flags() - e.flags()
.
locale_type imbue(locale_type l); ++
Effects: Returns the result of traits_inst.imbue(l)
where
+ traits_inst
is a (default initialized) instance of the template
+ parameter traits
stored within the object. Calls to imbue
+ invalidate any currently contained regular expression.
Postcondition: empty() == true
.
+locale_type getloc() const; ++
Effects: Returns the result of traits_inst.getloc()
where
+ traits_inst
is a (default initialized) instance of the template
+ parameter traits
stored within the object.
+void swap(basic_regex& e) throw(); ++
Effects: Swaps the contents of the two regular expressions.
+Postcondition: *this
contains the characters that were in e,
+ e contains the regular expression that was in *this
.
Complexity: constant time.
+Comparisons between basic_regex objects are provided on an experimental basis: + please note that these are likely to be removed from the standard library + proposal, so use with care if you are writing portable code.
++template <class charT, class traits> +bool operator == (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); ++
Effects: Returns lhs.compare(rhs) == 0
.
+template <class charT, class traits> +bool operator != (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); ++
Effects: Returns lhs.compare(rhs) != 0
.
+template <class charT, class traits> +bool operator < (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); ++
Effects: Returns lhs.compare(rhs) < 0
.
+template <class charT, class traits> +bool operator <= (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); ++
Effects: Returns lhs.compare(rhs) <= 0
.
+template <class charT, class traits> +bool operator >= (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); ++
Effects: Returns lhs.compare(rhs) >= 0
.
+template <class charT, class traits> +bool operator > (const basic_regex<charT, traits>& lhs, + const basic_regex<charT, traits>& rhs); ++
Effects: Returns lhs.compare(rhs) > 0
.
The basic_regex stream inserter is provided on an experimental basis, and + outputs the textual representation of the expression to the stream:
++template <class charT, class io_traits, class re_traits> +basic_ostream<charT, io_traits>& + operator << (basic_ostream<charT, io_traits>& os, + const basic_regex<charT, re_traits>& e); ++
Effects: Returns (os << e.str()).
++template <class charT, class traits> +void swap(basic_regex<charT, traits>& lhs, + basic_regex<charT, traits>& rhs); ++
Effects: calls lhs.swap(rhs)
.
Revised 7 Aug + + 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/captures.html b/doc/captures.html new file mode 100644 index 00000000..fe0232f8 --- /dev/null +++ b/doc/captures.html @@ -0,0 +1,254 @@ + + + ++
+ |
+
+ Boost.Regex+Understanding Captures+ |
+
+ |
+
Captures are the iterator ranges that are "captured" by marked sub-expressions + as a regular expression gets matched. Each marked sub-expression can + result in more than one capture, if it is matched more than once. This + document explains how captures and marked sub-expressions in Boost.Regex are + represented and accessed.
+Every time a Perl regular expression contains a parenthesis group (), it spits + out an extra field, known as a marked sub-expression, for example the + expression:
+(\w+)\W+(\w+)+
+ has two marked sub-expressions (known as $1 and $2 respectively); in addition + the complete match is known as $&, everything before the first match as $`, + and everything after the match as $'. So if the above expression is + searched for within "@abc def--", then we obtain:
++++
+
+ ++ ++ +$`
+"@" ++ +$& +"abc def" ++ +$1 +"abc" ++ +$2 +"def" ++ +$' +"--" +
In Boost.Regex all of these are accessible via the match_results + class that gets filled in when calling one of the matching algorithms (regex_search, + regex_match, or regex_iterator). + So given:
+boost::match_results<IteratorType> m;+
The Perl and Boost.Regex equivalents are as follows:
++++
+
+ ++ +Perl +Boost.Regex ++ +$` +m.prefix() ++ +$& +m[0] ++ +$n +m[n] ++ +$' +m.suffix() +
+
In Boost.Regex each sub-expression match is represented by a + sub_match object, this is basically just a pair of iterators denoting + the start and end position of the sub-expression match, but there are some + additional operators provided so that objects of type sub_match behave a lot + like a std::basic_string: for example they are implicitly + convertible to a basic_string, they can be compared + to a string, added to a string, or + streamed out to an output stream.
+When a regular expression match is found there is no need for all of the marked + sub-expressions to have participated in the match, for example the expression:
+(abc)|(def)
+can match either $1 or $2, but never both at the same time. In + Boost.Regex you can determine which sub-expressions matched by accessing the + sub_match::matched data member.
+When a marked sub-expression is repeated, then the sub-expression gets + "captured" multiple times, however normally only the final capture is + available, for example if
+(?:(\w+)\W+)++
is matched against
+one fine day+
Then $1 will contain the string "day", and all the previous captures will have + been forgotten.
+However, Boost.Regex has an experimental feature that allows all the capture + information to be retained - this is accessed either via the + match_results::captures member function or the sub_match::captures + member function. These functions return a container that contains a + sequence of all the captures obtained during the regular expression + matching. The following example program shows how this information may be + used:
+#include <boost/regex.hpp> +#include <iostream> + + +void print_captures(const std::string& regx, const std::string& text) +{ + boost::regex e(regx); + boost::smatch what; + std::cout << "Expression: \"" << regx << "\"\n"; + std::cout << "Text: \"" << text << "\"\n"; + if(boost::regex_match(text, what, e, boost::match_extra)) + { + unsigned i, j; + std::cout << "** Match found **\n Sub-Expressions:\n"; + for(i = 0; i < what.size(); ++i) + std::cout << " $" << i << " = \"" << what[i] << "\"\n"; + std::cout << " Captures:\n"; + for(i = 0; i < what.size(); ++i) + { + std::cout << " $" << i << " = {"; + for(j = 0; j < what.captures(i).size(); ++j) + { + if(j) + std::cout << ", "; + else + std::cout << " "; + std::cout << "\"" << what.captures(i)[j] << "\""; + } + std::cout << " }\n"; + } + } + else + { + std::cout << "** No Match found **\n"; + } +} + +int main(int , char* []) +{ + print_captures("(([[:lower:]]+)|([[:upper:]]+))+", "aBBcccDDDDDeeeeeeee"); + print_captures("(.*)bar|(.*)bah", "abcbar"); + print_captures("(.*)bar|(.*)bah", "abcbah"); + print_captures("^(?:(\\w+)|(?>\\W+))*$", "now is the time for all good men to come to the aid of the party"); + return 0; +}+
Which produces the following output:
+Expression: "(([[:lower:]]+)|([[:upper:]]+))+" +Text: "aBBcccDDDDDeeeeeeee" +** Match found ** + Sub-Expressions: + $0 = "aBBcccDDDDDeeeeeeee" + $1 = "eeeeeeee" + $2 = "eeeeeeee" + $3 = "DDDDD" + Captures: + $0 = { "aBBcccDDDDDeeeeeeee" } + $1 = { "a", "BB", "ccc", "DDDDD", "eeeeeeee" } + $2 = { "a", "ccc", "eeeeeeee" } + $3 = { "BB", "DDDDD" } +Expression: "(.*)bar|(.*)bah" +Text: "abcbar" +** Match found ** + Sub-Expressions: + $0 = "abcbar" + $1 = "abc" + $2 = "" + Captures: + $0 = { "abcbar" } + $1 = { "abc" } + $2 = { } +Expression: "(.*)bar|(.*)bah" +Text: "abcbah" +** Match found ** + Sub-Expressions: + $0 = "abcbah" + $1 = "" + $2 = "abc" + Captures: + $0 = { "abcbah" } + $1 = { } + $2 = { "abc" } +Expression: "^(?:(\w+)|(?>\W+))*$" +Text: "now is the time for all good men to come to the aid of the party" +** Match found ** + Sub-Expressions: + $0 = "now is the time for all good men to come to the aid of the party" + $1 = "party" + Captures: + $0 = { "now is the time for all good men to come to the aid of the party" } + $1 = { "now", "is", "the", "time", "for", "all", "good", "men", "to", "come", "to", "the", "aid", "of", "the", "party" } ++
Unfortunately enabling this feature has an impact on performance (even if you + don't use it), and a much bigger impact if you do use it, therefore to use this + feature you need to:
++
Revised + + 12 Dec 2003 +
+© Copyright John Maddock + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/character_class_names.html b/doc/character_class_names.html new file mode 100644 index 00000000..576e45d0 --- /dev/null +++ b/doc/character_class_names.html @@ -0,0 +1,326 @@ + + + ++
+ |
+
+ Boost.Regex+Character Class Names.+ |
+
+ |
+
The following character class names are always supported by Boost.Regex:
++
Name | +POSIX-standard name | +Description | +
alnum | +Yes | +Any alpha-numeric character. | +
alpha | +Yes | +Any alphabetic character. | +
blank | +Yes | +Any whitespace character that is not a line separator. | +
cntrl | +Yes | +Any control character. | +
d | +No | +Any decimal digit | +
digit | +Yes | +Any decimal digit. | +
graph | +Yes | +Any graphical character. | +
l | +No | +Any lower case character. | +
lower | +Yes | +Any lower case character. | +
Yes | +Any printable character. | +|
punct | +Yes | +Any punctuation character. | +
s | +No | +Any whitespace character. | +
space | +Yes | +Any whitespace character. | +
unicode | +No | +Any extended character whose code point is above 255 in value. | +
u | +No | +Any upper case character. | +
upper | +Yes | +Any upper case character. | +
w | +No | +Any word character (alphanumeric characters plus the underscore). | +
word | +No | +Any word character (alphanumeric characters plus the underscore). | +
xdigit | +Yes | +Any hexadecimal digit character. | +
+
The following character classes are only supported by Unicode + Regular Expressions: that is those that use the u32regex type. The + names used are the same as those from + Chapter 4 of the Unicode standard.
+Short Name | +Long Name | +
+ | +ASCII | +
+ | +Any | +
+ | +Assigned | +
C* | +Other | +
Cc | +Control | +
Cf | +Format | +
Cn | +Not Assigned | +
Co | +Private Use | +
Cs | +Surrogate | +
L* | +Letter | +
Ll | +Lowercase Letter | +
Lm | +Modifier Letter | +
Lo | +Other Letter | +
Lt | +Titlecase | +
Lu | +Uppercase Letter | +
M* | +Mark | +
Mc | +Spacing Combining Mark | +
Me | +Enclosing Mark | +
Mn | +Non-Spacing Mark | +
N* | +Number | +
Nd | +Decimal Digit Number | +
Nl | +Letter Number | +
No | +Other Number | +
P* | +Punctuation | +
Pc | +Connector Punctuation | +
Pd | +Dash Punctuation | +
Pe | +Close Punctuation | +
Pf | +Final Punctuation | +
Pi | +Initial Punctuation | +
Po | +Other Punctuation | +
Ps | +Open Punctuation | +
S* | +Symbol | +
Sc | +Currency Symbol | +
Sk | +Modifier Symbol | +
Sm | +Math Symbol | +
So | +Other Symbol | +
Z* | +Separator | +
Zl | +Line Separator | +
Zp | +Paragraph Separator | +
Zs | +Space Separator | +
Revised + + 10 Jan 2005 +
+© Copyright John Maddock 2004-5
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/collating_names.html b/doc/collating_names.html new file mode 100644 index 00000000..c553f1ed --- /dev/null +++ b/doc/collating_names.html @@ -0,0 +1,368 @@ + + + ++
+ |
+
+ Boost.Regex+Collating Element Names+ |
+
+ |
+
The following are treated as valid digraphs when used as a collating name:
+"ae", "Ae", "AE", "ch", "Ch", "CH", "ll", "Ll", "LL", "ss", "Ss", "SS", "nj", + "Nj", "NJ", "dz", "Dz", "DZ", "lj", "Lj", "LJ".
+The following symbolic names are recognised as valid collating element names, + in addition to any single character:
++
Name | +Character | +
NUL | +\x00 | +
SOH | +\x01 | +
STX | +\x02 | +
ETX | +\x03 | +
EOT | +\x04 | +
ENQ | +\x05 | +
ACK | +\x06 | +
alert | +\x07 | +
backspace | +\x08 | +
tab | +\t | +
newline | +\n | +
vertical-tab | +\v | +
form-feed | +\f | +
carriage-return | +\r | +
SO | +\xE | +
SI | +\xF | +
DLE | +\x10 | +
DC1 | +\x11 | +
DC2 | +\x12 | +
DC3 | +\x13 | +
DC4 | +\x14 | +
NAK | +\x15 | +
SYN | +\x16 | +
ETB | +\x17 | +
CAN | +\x18 | +
EM | +\x19 | +
SUB | +\x1A | +
ESC | +\x1B | +
IS4 | +\x1C | +
IS3 | +\x1D | +
IS2 | +\x1E | +
IS1 | +\x1F | +
space | +\x20 | +
exclamation-mark | +! | +
quotation-mark | +" | +
number-sign | +# | +
dollar-sign | +$ | +
percent-sign | +% | +
ampersand | +& | +
apostrophe | +' | +
left-parenthesis | +( | +
right-parenthesis | +) | +
asterisk | +* | +
plus-sign | ++ | +
comma | +, | +
hyphen | +- | +
period | +. | +
slash | +/ | +
zero | +0 | +
one | +1 | +
two | +2 | +
three | +3 | +
four | +4 | +
five | +5 | +
six | +6 | +
seven | +7 | +
eight | +8 | +
nine | +9 | +
colon | +: | +
semicolon | +; | +
less-than-sign | +< | +
equals-sign | += | +
greater-than-sign | +> | +
question-mark | +? | +
commercial-at | +@ | +
left-square-bracket | +[ | +
backslash | +\ | +
right-square-bracket | +] | +
circumflex | +^ | +
underscore | +_ | +
grave-accent | +` | +
left-curly-bracket | +{ | +
vertical-line | +| | +
right-curly-bracket | +} | +
tilde | +~ | +
DEL | +\x7F | +
+
When using Unicode aware regular expressions (with + the u32regex type), all the normal symbolic names for Unicode + characters (those given in Unidata.txt) are recognised.
++
Revised 12 Jan 2005 +
+© Copyright John Maddock 2004-2005
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/concepts.html b/doc/concepts.html new file mode 100644 index 00000000..ed598933 --- /dev/null +++ b/doc/concepts.html @@ -0,0 +1,453 @@ + + + ++
+ |
+
+ Boost.Regex+Concepts+ |
+
+ |
+
Type charT used as a template argument to class template + basic_regex, must have a trivial default constructor, copy constructor, + assignment operator, and destructor. In addition the following + requirements must be met for objects; c of type charT, c1 and c2 of type charT + const, and i of type int:
++
Expression | +Return type | +Assertion / Note / Pre- / Post-condition | +
charT c | +charT | +Default constructor (must be trivial). | +
charT c(c1) | +charT | +Copy constructor (must be trivial). | +
c1 = c2 | +charT | +Assignment operator (must be trivial). | +
c1 == c2 | +bool | +true if c1 has the same value as c2. | +
c1 != c2 | +bool | +true if c1 and c2 are not equal. | +
c1 < c2 | +bool | +true if the value of c1 is less than c2. | +
c1 > c2 | +bool | +true if the value of c1 is greater than c2. | +
c1 <= c2 | +bool | +true if c1 is less than or equal to c2. | +
c1 >= c2 | +bool | +true if c1 is greater than or equal to c2. | +
intmax_t i = c1 | +int | +
+ charT must be convertible to an integral type. +Note: type charT is not required to support this operation, if the traits class + used supports the full Boost-specific interface, rather than the minimal + standardised-interface (see traits class requirements below). + |
+
charT c(i); | +charT | +charT must be constructible from an integral type. | +
There are two sets of requirements for the traits template argument to + basic_regex: a minimal interface (which is part of the regex standardization + proposal), and an optional Boost-specific enhanced interface.
+In the following table X denotes a traits class defining types and functions + for the character container type charT; u is an object of type X; v is an + object of type const X; p is a value of type const charT*; I1 and I2 are Input + Iterators; c is a value of type const charT; s is an object of type + X::string_type; cs is an object of type const X::string_type; b is a value of + type bool; I is a value of type int; F1 and F2 are values of type const charT*; + and loc is an object of type X::locale_type.
++
+ Expression + |
+
+ Return type + |
+
+ Assertion / Note
+ |
+
+ X::char_type + |
+
+ charT + |
+
+ The character container type used in the implementation of class template |
+
+ X::size_type + |
+
+ + |
+
+ An unsigned integer type, capable of holding the length of a null-terminated + string of charT's. + |
+
+ X::string_type + |
+
+ std::basic_string<charT> or std::vector<charT> + |
+
+ + |
+
+ X::locale_type + |
+
+ Implementation defined + |
+
+ A copy constructible type that represents the locale used by the traits class. + |
+
+ X::char_class_type + |
+
+ Implementation defined + |
+
+ A bitmask type representing a particular character classification. Multiple + values of this type can be bitwise-or'ed together to obtain a new valid value. + |
+
+ X::length(p) + |
+
+ X::size_type + |
+
+ Yields the smallest |
+
+ v.translate(c) + |
+
+ X::char_type + |
+
+ Returns a character such that for any character d that is to be considered + equivalent to c then v.translate(c) == v.translate(d). + |
+
+ v.translate_nocase(c) + |
+ X::char_type | +For all characters C that are to be considered + equivalent to c when comparisons are to be performed without regard to case, + then v.translate_nocase(c) == v.translate_nocase(C). | +
+ v.transform(F1, F2) + |
+
+ X::string_type + |
+
+ Returns a sort key for the character sequence designated by the iterator range + [F1, F2) such that if the character sequence [G1, G2) sorts before the + character sequence [H1, H2) then v.transform(G1, G2) < v.transform(H1, + H2). + |
+
+ v.transform_primary(F1, F2) + |
+
+ X::string_type + |
+
+ Returns a sort key for the character sequence designated by the iterator range + [F1, F2) such that if the character sequence [G1, G2) sorts before the + character sequence [H1, H2) when character case is not considered then + v.transform_primary(G1, G2) < v.transform_primary(H1, H2). + |
+
+ v.lookup_classname(F1, F2) + |
+
+ X::char_class_type + |
+
+ Converts the character sequence designated by the iterator range [F1,F2) into a + bitmask type that can subsequently be passed to isctype. Values returned from + lookup_classname can be safely bitwise or'ed together. Returns 0 if the + character sequence is not the name of a character class recognized by X. The + value returned shall be independent of the case of the characters in the + sequence. + |
+
+ v.lookup_collatename(F1, F2) + |
+
+ X::string_type + |
+
+ Returns a sequence of characters that represents the collating element + consisting of the character sequence designated by the iterator range [F1, F2). + Returns an empty string if the character sequence is not a valid collating + element. + |
+
+ v.isctype(c, v.lookup_classname (F1, F2)) + |
+
+ bool + |
+
+ Returns true if character c is a member of the character class designated by + the iterator range [F1, F2), false otherwise. + |
+
+ v.value(c, i) + |
+
+ int + |
+
+ Returns the value represented by the digit c in base I if the character c is a + valid digit in base I; otherwise returns -1. [Note: the value of I will only be + 8, 10, or 16. -end note] + |
+
+ u.imbue(loc) + |
+
+ X::locale_type + |
+
+ Imbues |
+
+ v.getloc() + |
+
+ X::locale_type + |
+
+ Returns the current locale used by |
+
+ v.error_string(i) + |
+
+ std::string + |
+
+ Returns a human readable error string for the error condition |
+
The following additional requirements are strictly optional, however in order + for basic_regex to take advantage of these additional interfaces, all of the + following requirements must be met; basic_regex will detect the presence or + absence of member boost_extensions_tag and configure itself + appropriately.
++
Expression | +Result | +
+ Assertion / Note
+ |
+
X::boost_extensions_tag | +An unspecified type. | +When present, all of the extensions listed in this table must be present. | +
+ v.syntax_type(c) + |
+ regex_constants::syntax_type | +
+ Returns a symbolic value of type |
+
v.escape_syntax_type(c) | +regex_constants::escape_syntax_type | +
+ Returns a symbolic value of type |
+
+ v.translate(c, b) + |
+ X::char_type | +
+ Returns a character |
+
+ v.toi(I1, I2, i) + |
+ An integer type capable of holding either a charT or an int. | +
+ Behaves as follows: if |
+
+ v.error_string(i) + |
+ std::string | +
+ Returns a human readable error string for the error condition |
+
v.tolower(c) | +X::char_type | +Converts c to lower case, used for Perl-style \l and \L formatting operations. | +
v.toupper(c) | +X::char_type | +Converts c to upper case, used for Perl-style \u and \U formatting operations. | +
+
The regular expression algorithms (and iterators) all require a + Bidirectional-Iterator.
++
Revised + + 24 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/configuration.html b/doc/configuration.html new file mode 100644 index 00000000..9da777e7 --- /dev/null +++ b/doc/configuration.html @@ -0,0 +1,155 @@ + + + +
+ |
+
+ Boost.Regex+Configuration and setup+ |
+
+ |
+
You shouldn't need to do anything special to configure boost.regex for use with + your compiler - the boost.config subsystem + should already take care of it, if you do have problems (or you are using a + particularly obscure compiler or platform) then boost.config has + a configure script.
+The following macros (see user.hpp) + control how boost.regex interacts with the user's locale:
+BOOST_REGEX_USE_C_LOCALE | ++ Forces boost.regex to use the global C locale in its traits class support: this + is now deprecated in favour of the C++ locale. | +
BOOST_REGEX_USE_CPP_LOCALE | +Forces boost.regex to use std::locale in its default traits class, regular + expressions can then be imbued with an instance specific locale. + This is the default behaviour on non-Windows platforms. | +
BOOST_REGEX_NO_W32 | +Tells boost.regex not to use any Win32 API's even when available (implies + BOOST_REGEX_USE_CPP_LOCALE unless BOOST_REGEX_USE_C_LOCALE is set). | +
BOOST_REGEX_DYN_LINK | +For Microsoft and Borland C++ builds, this tells boost.regex that it should + link to the dll build of the boost.regex. By default boost.regex will + link to its static library build, even if the dynamic C runtime library is in + use. | +
BOOST_REGEX_NO_LIB | +For Microsoft and Borland C++ builds, this tells boost.regex that it should + not automatically select the library to link to. | +
BOOST_REGEX_RECURSIVE | +Tells boost.regex to use a stack-recursive matching algorithm. This is + generally the fastest option (although there is very little in it), but can + cause stack overflow in extreme cases, on Win32 this can be handled safely, but + this is not the case on other platforms. | +
BOOST_REGEX_NON_RECURSIVE | +Tells boost.regex to use a non-stack recursive matching algorithm, this can be + slightly slower than the alternative, but is always safe no matter how + pathological the regular expression. This is the default on non-Win32 + platforms. | +
The following option applies only if BOOST_REGEX_RECURSIVE is set.
+BOOST_REGEX_HAS_MS_STACK_GUARD | +Tells boost.regex that Microsoft style __try - __except blocks are supported, + and can be used to safely trap stack overflow. | +
The following options apply only if BOOST_REGEX_NON_RECURSIVE is set.
BOOST_REGEX_BLOCKSIZE | +In non-recursive mode, boost.regex uses largish blocks of memory to act as a + stack for the state machine, the larger the block size then the fewer + allocations that will take place. This defaults to 4096 bytes, which is + large enough to match the vast majority of regular expressions without + further allocations, however, you can choose smaller or larger values depending + upon your platform's characteristics. | +
BOOST_REGEX_MAX_BLOCKS | +Tells boost.regex how many blocks of size BOOST_REGEX_BLOCKSIZE it is + permitted to use. If this value is exceeded then boost.regex will stop + trying to find a match and throw a std::runtime_error. Defaults to 1024, + don't forget to tweak this value if you alter BOOST_REGEX_BLOCKSIZE by much. | +
BOOST_REGEX_MAX_CACHE_BLOCKS | +Tells boost.regex how many memory blocks to store in its internal cache - + memory blocks are taken from this cache rather than by calling ::operator + new. Generally speeking this can be an order of magnitude faster than + calling ::operator new each time a memory block is required, but has the + downside that boost.regex can end up caching a large chunk of memory (by + default up to 16 blocks each of BOOST_REGEX_BLOCKSIZE size). If memory is + tight then try defining this to 0 (disables all caching), or if that is too + slow, then a value of 1 or 2, may be sufficient. On the other hand, on + large multi-processor, multi-threaded systems, you may find that a higher value + is in order. | +
Revised + + 23 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/contacts.html b/doc/contacts.html new file mode 100644 index 00000000..645460ec --- /dev/null +++ b/doc/contacts.html @@ -0,0 +1,87 @@ + + + +
+ |
+
+ Boost.Regex+Contacts and Acknowledgements+ |
+
+ |
+
The author can be contacted at john@johnmaddock.co.uk; the home page for + this library is at www.boost.org.
+I am indebted to Robert Sedgewick's + "Algorithms in C++" for forcing me to think about algorithms and their + performance, and to the folks at boost for + forcing me to think, period.
+Eric Niebler, author of the + GRETA regular expression component, has shared several important ideas, + in a series of long discussions.
+Pete Becker, of Dinkumware Ltd, has + helped enormously with the standardisation proposal language.
+The following people have all contributed useful comments or fixes: Dave + Abrahams, Mike Allison, Edan Ayal, Jayashree Balasubramanian, Jan Bölsche, + Beman Dawes, Paul Baxter, David Bergman, David Dennerline, Edward Diener, Peter + Dimov, Robert Dunn, Fabio Forno, Tobias Gabrielsson, Rob Gillen, Marc Gregoire, + Chris Hecker, Nick Hodapp, Jesse Jones, Martin Jost, Boris Krasnovskiy, Jan + Hermelink, Max Leung, Wei-hao Lin, Jens Maurer, Richard Peters, Heiko Schmidt, + Jason Shirk, Gerald Slacik, Scobie Smith, Mike Smyth, Alexander Sokolovsky, + Hervé Poirier, Michael Raykh, Marc Recht, Scott VanCamp, Bruno Voigt, Alexey + Voinov, Jerry Waldorf, Rob Ward, Lealon Watts, John Wismar, Thomas Witt and + Yuval Yosef. I am also grateful to the manuals supplied with the Henry Spencer, + Perl and GNU regular expression libraries - wherever possible I have tried to + maintain compatibility with these libraries and with the POSIX standard - the + code however is entirely my own, including any bugs! I can absolutely guarantee + that I will not fix any bugs I don't know about, so if you have any comments or + spot any bugs, please get in touch.
+Useful further information can be found at:
+Short tutorials on regular expressions can be + found here and here.
+The main book on regular expressions is + Mastering Regular Expressions, published by O'Reilly.
+Information on the + Boost.regex standardization proposal, along with other + standard library extension proposals can be found on the + C++ Committees web pages.
+The Open Unix + Specification contains a wealth of useful material, including the + regular expression syntax, and specifications for + <regex.h> and + <nl_types.h>.
+The Pattern Matching Pointers + site is a "must visit" resource for anyone interested in pattern matching.
+Glimpse and Agrep, use a + simplified regular expression syntax to achieve faster search times.
+Udi Manber and + Ricardo Baeza-Yates both have a selection of useful pattern matching + papers available from their respective web sites.
+ +Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/error_type.html b/doc/error_type.html new file mode 100644 index 00000000..6ca14383 --- /dev/null +++ b/doc/error_type.html @@ -0,0 +1,139 @@ + + + ++
+ |
+
+ Boost.Regex+error_type+ |
+
+ |
+
Type error_type represents the different types of errors that can be raised by + the library when parsing a regular expression.
++namespace boost{ namespace regex_constants{ + +typedef implementation-specific-type error_type; + +static const error_type error_collate; +static const error_type error_ctype; +static const error_type error_escape; +static const error_type error_backref; +static const error_type error_brack; +static const error_type error_paren; +static const error_type error_brace; +static const error_type error_badbrace; +static const error_type error_range; +static const error_type error_space; +static const error_type error_badrepeat; +static const error_type error_complexity; +static const error_type error_stack; +static const error_type error_bad_pattern; + +} // namespace regex_constants +} // namespace boost ++
+
The type error_type is an implementation-specific enumeration type that may + take one of the following values:
++
Constant | +Meaning | +
error_collate | +An invalid collating element was specified in a [[.name.]] block. | +
error_ctype | +An invalid character class name was specified in a [[:name:]] block. | +
error_escape | +An invalid or trailing escape was encountered. | +
error_backref | +A back-reference to a non-existent marked sub-expression was encountered. | +
error_brack | +An invalid character set [...] was encountered. | +
error_paren | +
+ Mismatched '(' and ')'. + |
+
error_brace | +Mismatched '{' and '}'. | +
error_badbrace | +Invalid contents of a {...} block. | +
error_range | +A character range was invalid, for example [d-a]. | +
error_space | +Out of memory. | +
error_badrepeat | +An attempt to repeat something that can not be repeated - for example a*+ | +
error_complexity | +The expression became too complex to handle. | +
error_stack | +Out of program stack space. | +
error_bad_pattern | +Other unspecified errors. | +
Revised + + 24 June 2004 + +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + diff --git a/doc/examples.html b/doc/examples.html new file mode 100644 index 00000000..737b11ee --- /dev/null +++ b/doc/examples.html @@ -0,0 +1,117 @@ + + + +
+ |
+
+ Boost.Regex+Examples+ |
+
+ |
+
A regression test application that gives the matching/searching algorithms a + full workout. The presence of this program is your guarantee that the library + will behave as claimed - at least as far as those items tested are concerned - + if anyone spots anything that isn't being tested I'd be glad to hear about it.
+Directory: libs/regex/test/regress.
+Files: basic_tests.cpp + test_deprecated.cpp main.cpp.
+Verifies that "bad" regular expressions don't cause the matcher to go into + infinite loops, but to throw an exception instead.
+Directory: libs/regex/test/pathology.
+Files: bad_expression_test.cpp.
+Verifies that the matcher can't overrun the stack (no matter what the + expression).
+Directory: libs/regex/test/pathology.
+Files: recursion_test.cpp.
+Verifies that the library meets all documented concepts (a compile only test).
+Directory: libs/regex/test/concepts.
+Files: concept_check.cpp.
+Test code for captures.
+Directory: libs/regex/test/captures.
+Files: captures_test.cpp.
+A simple grep implementation, run with the -h command line option to find out + its usage.
+Files: grep.cpp
+A simple interactive expression matching application, the results of all + matches are timed, allowing the programmer to optimize their regular + expressions where performance is critical.
+Files: regex_timer.cpp.
+The snippets examples contain the code examples used in the documentation:
+captures_example.cpp: + Demonstrates the use of captures.
+credit_card_example.cpp: + Credit card number formatting code.
+partial_regex_grep.cpp: + Search example using partial matches.
+partial_regex_match.cpp: + regex_match example using partial matches.
+regex_iterator_example.cpp: + Iterating through a series of matches.
+regex_match_example.cpp: + ftp based regex_match example.
+regex_merge_example.cpp: + regex_merge example: converts a C++ file to syntax highlighted HTML.
+regex_replace_example.cpp: + regex_replace example: converts a C++ file to syntax highlighted HTML
+regex_search_example.cpp: + regex_search example: searches a cpp file for class definitions.
+regex_token_iterator_eg_1.cpp: + split a string into a series of tokens.
+regex_token_iterator_eg_2.cpp: + enumerate the linked URL's in a HTML file.
+The following are deprecated:
+regex_grep_example_1.cpp: + regex_grep example 1: searches a cpp file for class definitions.
+regex_grep_example_2.cpp: + regex_grep example 2: searches a cpp file for class definitions, using a global + callback function.
+regex_grep_example_3.cpp: + regex_grep example 3: searches a cpp file for class definitions, using a bound + member function callback.
+regex_grep_example_4.cpp: + regex_grep example 4: searches a cpp file for class definitions, using a C++ + Builder closure as a callback.
+regex_split_example_1.cpp: + regex_split example: split a string into tokens.
+regex_split_example_2.cpp + : regex_split example: spit out linked URL's.
+ +Revised + + 28 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/faq.html b/doc/faq.html new file mode 100644 index 00000000..e9557443 --- /dev/null +++ b/doc/faq.html @@ -0,0 +1,114 @@ + + + +
+ |
+
+ Boost.Regex+FAQ+ |
+
+ |
+
Q. Why can't I + use the "convenience" versions of regex_match / regex_search / regex_grep / + regex_format / regex_merge?
+A. These versions may or may not be available depending upon the capabilities + of your compiler, the rules determining the format of these functions are quite + complex - and only the versions visible to a standard compliant compiler are + given in the help. To find out what your compiler supports, run + <boost/regex.hpp> through your C++ pre-processor, and search the output + file for the function that you are interested in.
+Q. I can't get regex++ to work with + escape characters, what's going on?
+A. If you embed regular expressions in C++ code, then remember that escape + characters are processed twice: once by the C++ compiler, and once by the + regex++ expression compiler, so to pass the regular expression \d+ to regex++, + you need to embed "\\d+" in your code. Likewise to match a literal backslash + you will need to embed "\\\\" in your code. +
+Q. Why does using parentheses in a POSIX regular expression + change the result of a match?
+A. For POSIX (extended and basic) regular expressions, but not for perl regexes, + parentheses don't only mark; they determine what the best match is as well. + When the expression is compiled as a POSIX basic or extended regex then + Boost.regex follows the POSIX standard leftmost longest rule for determining + what matched. So if there is more than one possible match after considering the + whole expression, it looks next at the first sub-expression and then the second + sub-expression and so on. So...
++"(0*)([0-9]*)" against "00123" would produce +$1 = "00" +$2 = "123" ++
whereas
+"0*([0-9]*)" against "00123" would produce +$1 = "00123" +
If you think about it, had $1 only matched the "123", this would be "less good" + than the match "00123" which is both further to the left and longer. If you + want $1 to match only the "123" part, then you need to use something like:
++"0*([1-9][0-9]*)" ++
as the expression.
+Q. Why don't character ranges work properly (POSIX mode
+ only)?
+ A. The POSIX standard specifies that character range expressions are locale
+ sensitive - so for example the expression [A-Z] will match any collating
+ element that collates between 'A' and 'Z'. That means that for most locales
+ other than "C" or "POSIX", [A-Z] would match the single character 't' for
+ example, which is not what most people expect - or at least not what most
+ people have come to expect from regular expression engines. For this reason,
+ the default behaviour of boost.regex (perl mode) is to turn locale sensitive
+ collation off by not setting the regex_constants::collate compile time flag.
+ However if you set a non-default compile time flag - for example
+ regex_constants::extended or regex_constants::basic, then locale dependent
+ collation will be enabled, this also applies to the POSIX API functions which
+ use either regex_constants::extended or regex_constants::basic internally. [Note
 - when regex_constants::nocollate is in effect, the library behaves "as if" the
 LC_COLLATE locale category were always "C", regardless of what it's actually set
+ to - end note].
Q. Why are there no throw specifications on any of the + functions? What exceptions can the library throw?
+A. Not all compilers support (or honor) throw specifications, others support + them but with reduced efficiency. Throw specifications may be added at a later + date as compilers begin to handle this better. The library should throw only + three types of exception: boost::bad_expression can be thrown by basic_regex + when compiling a regular expression, std::runtime_error can be thrown when a + call to basic_regex::imbue tries to open a message catalogue that doesn't + exist, or when a call to regex_search or regex_match results in an + "everlasting" search, or when a call to RegEx::GrepFiles or + RegEx::FindFiles tries to open a file that cannot be opened, finally + std::bad_alloc can be thrown by just about any of the functions in this + library.
+ +Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/format_boost_syntax.html b/doc/format_boost_syntax.html new file mode 100644 index 00000000..d497ddeb --- /dev/null +++ b/doc/format_boost_syntax.html @@ -0,0 +1,163 @@ + + + ++
+ |
+
+ Boost.Regex+Boost-Extended Format String Syntax+ |
+
+ |
+
Boost-Extended format strings treat all characters as literals except for + '$', '\', '(', ')', '?' and ':'.
+The characters '(' and ')' perform lexical grouping, use \( and \) if you want + to output literal parentheses.
+The character '?' begins a conditional expression, the general form is:
+?Ntrue-expression:false-expression+
where N is a decimal digit.
+If sub-expression N was matched, then true-expression is evaluated and + sent to output, otherwise false-expression is evaluated and sent to output.
+You will normally need to surround a conditional-expression with parentheses in + order to prevent ambiguities.
+Placeholder sequences specify that some part of what matched the regular + expression should be sent to output as follows:
++
Placeholder | +Meaning | +
$& | +Outputs what matched the whole expression. | +
$` | +Outputs the text between the end of the last match found (or the start of the + text if no previous match was found), and the start of the current match. | +
$' | +Outputs all the text following the end of the current match. | +
$$ | +Outputs a literal '$' | +
$n | +Outputs what matched the n'th sub-expression. | +
Any $-placeholder sequence not listed above, results in '$' being treated as a + literal.
+An escape character followed by any character x, outputs that + character unless x is one of the escape sequences shown below.
++
Escape | +Meaning | +
\a | +Outputs the bell character: '\a'. | +
\e | +Outputs the ANSI escape character (code point 27). | +
\f | +Outputs a form feed character: '\f' | +
\n | +Outputs a newline character: '\n'. | +
\r | +Outputs a carriage return character: '\r'. | +
\t | +Outputs a tab character: '\t'. | +
\v | +Outputs a vertical tab character: '\v'. | +
\xDD | +Outputs the character whose hexadecimal code point is 0xDD | +
\x{DDDD} | +Outputs the character whose hexadecimal code point is 0xDDDD | +
\cX | +Outputs the ANSI escape sequence "escape-X". | +
\D | +If D is a decimal digit in the range 1-9, then outputs the text that + matched sub-expression D. | +
\l | +Causes the next character to be outputted, to be output in lower case. | +
\u | +Causes the next character to be outputted, to be output in upper case. | +
\L | +Causes all subsequent characters to be output in lower case, until a \E is + found. | +
\U | +Causes all subsequent characters to be output in upper case, until a \E is + found. | +
\E | +Terminates a \L or \U sequence. | +
+
Revised + + 24 Nov 2004 +
+© Copyright John Maddock 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/format_perl_syntax.html b/doc/format_perl_syntax.html new file mode 100644 index 00000000..481a141a --- /dev/null +++ b/doc/format_perl_syntax.html @@ -0,0 +1,150 @@ + + + ++
+ |
+
+ Boost.Regex+Perl-Style Format String Syntax+ |
+
+ |
+
Perl-style format strings treat all characters as literals except '$' and '\' + which start placeholder and escape sequences respectively.
+Placeholder sequences specify that some part of what matched the regular + expression should be sent to output as follows:
++
Placeholder | +Meaning | +
$& | +Outputs what matched the whole expression. | +
$` | +Outputs the text between the end of the last match found (or the start of the + text if no previous match was found), and the start of the current match. | +
$' | +Outputs all the text following the end of the current match. | +
$$ | +Outputs a literal '$' | +
$n | +Outputs what matched the n'th sub-expression. | +
Any $-placeholder sequence not listed above, results in '$' being treated as a + literal.
+An escape character followed by any character x, outputs that + character unless x is one of the escape sequences shown below.
++
Escape | +Meaning | +
\a | +Outputs the bell character: '\a'. | +
\e | +Outputs the ANSI escape character (code point 27). | +
\f | +Outputs a form feed character: '\f' | +
\n | +Outputs a newline character: '\n'. | +
\r | +Outputs a carriage return character: '\r'. | +
\t | +Outputs a tab character: '\t'. | +
\v | +Outputs a vertical tab character: '\v'. | +
\xDD | +Outputs the character whose hexadecimal code point is 0xDD | +
\x{DDDD} | +Outputs the character whose hexadecimal code point is 0xDDDD | +
\cX | +Outputs the ANSI escape sequence "escape-X". | +
\D | +If D is a decimal digit in the range 1-9, then outputs the text that + matched sub-expression D. | +
\l | +Causes the next character to be outputted, to be output in lower case. | +
\u | +Causes the next character to be outputted, to be output in upper case. | +
\L | +Causes all subsequent characters to be output in lower case, until a \E is + found. | +
\U | +Causes all subsequent characters to be output in upper case, until a \E is + found. | +
\E | +Terminates a \L or \U sequence. | +
+
Revised + + 24 Nov 2004 +
+© Copyright John Maddock 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/format_sed_syntax.html b/doc/format_sed_syntax.html new file mode 100644 index 00000000..ee0a331b --- /dev/null +++ b/doc/format_sed_syntax.html @@ -0,0 +1,109 @@ + + + ++
+ |
+
+ Boost.Regex+Sed-Style Format String Syntax+ |
+
+ |
+
Sed-style format strings treat all characters as literals except:
++
& | +The ampersand character is replaced in the output stream by the whole of + what matched the regular expression. Use \& to output a literal + '&' character. | +
\ | +Specifies an escape sequence. | +
+
An escape character followed by any character x, outputs that + character unless x is one of the escape sequences shown below.
++
Escape | +Meaning | +
\a | +Outputs the bell character: '\a'. | +
\e | +Outputs the ANSI escape character (code point 27). | +
\f | +Outputs a form feed character: '\f' | +
\n | +Outputs a newline character: '\n'. | +
\r | +Outputs a carriage return character: '\r'. | +
\t | +Outputs a tab character: '\t'. | +
\v | +Outputs a vertical tab character: '\v'. | +
\xDD | +Outputs the character whose hexadecimal code point is 0xDD | +
\x{DDDD} | +Outputs the character whose hexadecimal code point is 0xDDDD | +
\cX | +Outputs the ANSI escape sequence "escape-X". | +
\D | +If D is a decimal digit in the range 1-9, then outputs the text that + matched sub-expression D. | +
+
Revised + + 24 Nov 2004 +
+© Copyright John Maddock 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/format_syntax.html b/doc/format_syntax.html new file mode 100644 index 00000000..c7061d21 --- /dev/null +++ b/doc/format_syntax.html @@ -0,0 +1,52 @@ + + + +
+ |
+
+ Boost.Regex+Format String Syntax+ |
+
+ |
+
Format strings are used by the algorithm regex_replace and + by match_results::format, and are used to + transform one string into another.
++ There are three kinds of format string: Sed, Perl and Boost-extended.
+Alternatively, when the flag format_literal
is passed to one of these
+ functions, then the format string is treated as a string literal, and is copied
+ unchanged to the output.
Sed Style Format Strings
+ Perl Style Format Strings
+ Boost-Extended Format Strings
Revised + + 24 Nov 2004 +
+© Copyright John Maddock 1998- + + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/headers.html b/doc/headers.html new file mode 100644 index 00000000..031d33fe --- /dev/null +++ b/doc/headers.html @@ -0,0 +1,48 @@ + + + ++
+ |
+
+ Boost.Regex+Headers+ |
+
+ |
+
There are two main headers used by this library: <boost/regex.hpp> + provides full access to the main template library, while + <boost/cregex.hpp> provides access to the (deprecated) high level class + RegEx, and the POSIX API functions. +
+There is also a header containing only forward declarations + <boost/regex_fwd.hpp> for use when an interface is dependent upon + boost::basic_regex, but otherwise does not need the full definitions.
++
Revised + + 28 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/history.html b/doc/history.html new file mode 100644 index 00000000..72a50f03 --- /dev/null +++ b/doc/history.html @@ -0,0 +1,177 @@ + + + ++
+ |
+
+ Boost.Regex+History+ |
+
+ |
+
Boost 1.34
+Boost 1.33.1
+Boost 1.33.0.
+format_literal
+ that treats the replace string as a literal, rather than a Perl or Sed style
+ format string.
+ regex_error
. The types used previously - bad_expression
+ and bad_pattern
- are now just typedefs for regex_error
.
+ Type regex_error
has a couple of new members: code()
to
+ report an error code rather than a string, and position()
to
+ report where in the expression the error occurred.Boost 1.32.1.
+Boost 1.31.0.
++
Revised + + 28 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/icu_strings.html b/doc/icu_strings.html new file mode 100644 index 00000000..39548198 --- /dev/null +++ b/doc/icu_strings.html @@ -0,0 +1,468 @@ + + + ++
+ |
+
+ Boost.Regex+Working With Unicode and ICU String Types.+ |
+
+ |
+
The header:
+<boost/regex/icu.hpp>+
contains the data types and algorithms necessary for working with regular + expressions in a Unicode aware environment. +
+In order to use this header you will need + the ICU library, and you will need to have built the Boost.Regex library + with ICU support enabled.
+The header will enable you to:
+Header <boost/regex/icu.hpp> provides a regular expression traits + class that handles UTF-32 characters:
+class icu_regex_traits;+
and a regular expression type based upon that:
+typedef basic_regex<UChar32,icu_regex_traits> u32regex;+
The type u32regex is the regular expression type to use for all Unicode + regular expressions; internally it uses UTF-32 code points, but can be created + from, and used to search, either UTF-8, or UTF-16 encoded strings as well as + UTF-32 ones.
+The constructors, and + assign member functions of u32regex, require UTF-32 encoded strings, but + there are a series of overloaded algorithms called make_u32regex which allow + regular expressions to be created from UTF-8, UTF-16, or UTF-32 encoded + strings:
+template <class InputIterator> +u32regex make_u32regex(InputIterator i, InputIterator j, boost::regex_constants::syntax_option_type opt); ++
Effects: Creates a regular expression object from the iterator
+ sequence [i,j). The character encoding of the sequence is determined based upon
+ sizeof(*i)
: 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
u32regex make_u32regex(const char* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl); ++
Effects: Creates a regular expression object from the + Null-terminated UTF-8 character sequence p.
+u32regex make_u32regex(const unsigned char* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);+
Effects: Creates a regular expression object from the + Null-terminated UTF-8 character sequence p.u32regex + make_u32regex(const wchar_t* p, boost::regex_constants::syntax_option_type opt + = boost::regex_constants::perl);
+Effects: Creates a regular expression object from the
+ Null-terminated character sequence p. The character encoding of
+ the sequence is determined based upon sizeof(wchar_t)
: 1 implies
+ UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
u32regex make_u32regex(const UChar* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);+
Effects: Creates a regular expression object from the + Null-terminated UTF-16 character sequence p.
+template<class C, class T, class A> +u32regex make_u32regex(const std::basic_string<C, T, A>& s, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);+
Effects: Creates a regular expression object from the string s.
+ The character encoding of the string is determined based upon sizeof(C)
:
+ 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
u32regex make_u32regex(const UnicodeString& s, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);+
Effects: Creates a regular expression object from the UTF-16 + encoded string s.
+The regular expression algorithms regex_match, + regex_search and regex_replace all + expect that the character sequence upon which they operate, is encoded in the + same character encoding as the regular expression object with which they are + used. For Unicode regular expressions that behavior is undesirable: while + we may want to process the data in UTF-32 "chunks", the actual data is much + more likely to be encoded as either UTF-8 or UTF-16. Therefore the header + <boost/regex/icu.hpp> provides a series of thin wrappers around these + algorithms, called u32regex_match, u32regex_search, and u32regex_replace. + These wrappers use iterator-adapters internally to make external UTF-8 or + UTF-16 data look as though it's really a UTF-32 sequence, that can then be + passed on to the "real" algorithm.
+For each regex_match algorithm defined by + <boost/regex.hpp>, then <boost/regex/icu.hpp> defines an overloaded + algorithm that takes the same arguments, but which is called u32regex_match, + and which will accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an + ICU UnicodeString as input.
+Example: match a password, encoded in a UTF-16 UnicodeString:
+// +// Find out if *password* meets our password requirements, +// as defined by the regular expression *requirements*. +// +bool is_valid_password(const UnicodeString& password, const UnicodeString& requirements) +{ + return boost::u32regex_match(password, boost::make_u32regex(requirements)); +} ++
+
Example: match a UTF-8 encoded filename:
+// +// Extract filename part of a path from a UTF-8 encoded std::string and return the result +// as another std::string: +// +std::string get_filename(const std::string& path) +{ + boost::u32regex r = boost::make_u32regex("(?:\\A|.*\\\\)([^\\\\]+)"); + boost::smatch what; + if(boost::u32regex_match(path, what, r)) + { + // extract $1 as a std::string: + return what.str(1); + } + else + { + throw std::runtime_error("Invalid pathname"); + } +} ++
For each regex_search algorithm defined by + <boost/regex.hpp>, then <boost/regex/icu.hpp> defines an overloaded + algorithm that takes the same arguments, but which is called u32regex_search, + and which will accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an + ICU UnicodeString as input.
+Example: search for a character sequence in a specific + language block: +
+UnicodeString extract_greek(const UnicodeString& text) +{ + // searches through some UTF-16 encoded text for a block encoded in Greek, + // this expression is imperfect, but the best we can do for now - searching + // for specific scripts is actually pretty hard to do right. + // + // Here we search for a character sequence that begins with a Greek letter, + // and continues with characters that are either not-letters ( [^[:L*:]] ) + // or are characters in the Greek character block ( [\\x{370}-\\x{3FF}] ). + // + boost::u32regex r = boost::make_u32regex(L"[\\x{370}-\\x{3FF}](?:[^[:L*:]]|[\\x{370}-\\x{3FF}])*"); + boost::u16match what; + if(boost::u32regex_search(text, what, r)) + { + // extract $0 as a UnicodeString: + return UnicodeString(what[0].first, what.length(0)); + } + else + { + throw std::runtime_error("No Greek found!"); + } +}+
For each regex_replace algorithm defined by + <boost/regex.hpp>, then <boost/regex/icu.hpp> defines an overloaded + algorithm that takes the same arguments, but which is called u32regex_replace, + and which will accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an + ICU UnicodeString as input. The input sequence and the format string + specifier passed to the algorithm, can be encoded independently (for example + one can be UTF-8, the other in UTF-16), but the result string / output iterator + argument must use the same character encoding as the text being searched.
+Example: Credit card number reformatting:
+// +// Take a credit card number as a string of digits, +// and reformat it as a human readable string with "-" +// separating each group of four digits; +// note that we're mixing a UTF-32 regex, with a UTF-16 +// string and a UTF-8 format specifier, and it still all +// just works: +// +const boost::u32regex e = boost::make_u32regex("\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z"); +const char* human_format = "$1-$2-$3-$4"; + +UnicodeString human_readable_card_number(const UnicodeString& s) +{ + return boost::u32regex_replace(s, e, human_format); +}+
+
Type u32regex_iterator is in all respects the same as + regex_iterator except that since the regular expression type is always + u32regex it only takes one template parameter (the iterator type). It also + calls u32regex_search internally, allowing it to interface correctly with + UTF-8, UTF-16, and UTF-32 data:
++template <class BidirectionalIterator> +class u32regex_iterator +{ + // for members see regex_iterator +}; + +typedef u32regex_iterator<const char*> utf8regex_iterator; +typedef u32regex_iterator<const UChar*> utf16regex_iterator; +typedef u32regex_iterator<const UChar32*> utf32regex_iterator; ++
In order to simplify the construction of a u32regex_iterator from a string, + there are a series of non-member helper functions called + make_u32regex_iterator:
++u32regex_iterator<const char*> + make_u32regex_iterator(const char* s, + const u32regex& e, + regex_constants::match_flag_type m = regex_constants::match_default); + +u32regex_iterator<const wchar_t*> + make_u32regex_iterator(const wchar_t* s, + const u32regex& e, + regex_constants::match_flag_type m = regex_constants::match_default); + +u32regex_iterator<const UChar*> + make_u32regex_iterator(const UChar* s, + const u32regex& e, + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class Traits, class Alloc> +u32regex_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> + make_u32regex_iterator(const std::basic_string<charT, Traits, Alloc>& s, + const u32regex& e, + regex_constants::match_flag_type m = regex_constants::match_default); + +u32regex_iterator<const UChar*> + make_u32regex_iterator(const UnicodeString& s, + const u32regex& e, + regex_constants::match_flag_type m = regex_constants::match_default);+
+
Each of these overloads returns an iterator that enumerates all occurrences of + expression e, in text s, using match_flags m.
+Example: search for international currency symbols, along with + their associated numeric value:
++void enumerate_currencies(const std::string& text) +{ + // enumerate and print all the currency symbols, along + // with any associated numeric values: + const char* re = + "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?" + "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?" + "(?(1)" + "|(?(2)" + "[[:Cf:][:Cc:][:Z*:]]*" + ")" + "[[:Sc:]]" + ")"; + boost::u32regex r = boost::make_u32regex(re); + boost::u32regex_iterator<std::string::const_iterator> i(boost::make_u32regex_iterator(text, r)), j; + while(i != j) + { + std::cout << (*i)[0] << std::endl; + ++i; + } +}+
+
Calling +
+enumerate_currencies(" $100.23 or £198.12 ");+
Yields the output:
+$100.23+
£198.12
Provided of course that the input is encoded as UTF-8.
+Type u32regex_token_iterator is in all respects the same as + regex_token_iterator except that since the regular expression type is + always u32regex it only takes one template parameter (the iterator type). + It also calls u32regex_search internally, allowing it to interface correctly + with UTF-8, UTF-16, and UTF-32 data:
+template <class BidirectionalIterator> +class u32regex_token_iterator +{ + // for members see regex_token_iterator +}; + +typedef u32regex_token_iterator<const char*> utf8regex_token_iterator; +typedef u32regex_token_iterator<const UChar*> utf16regex_token_iterator; +typedef u32regex_token_iterator<const UChar32*> utf32regex_token_iterator; ++
In order to simplify the construction of a u32regex_token_iterator from a + string, there are a series of non-member helper functions called + make_u32regex_token_iterator:
++u32regex_token_iterator<const char*> + make_u32regex_token_iterator(const char* s, + const u32regex& e, + int sub, + regex_constants::match_flag_type m = regex_constants::match_default); + +u32regex_token_iterator<const wchar_t*> + make_u32regex_token_iterator(const wchar_t* s, + const u32regex& e, + int sub, + regex_constants::match_flag_type m = regex_constants::match_default); + +u32regex_token_iterator<const UChar*> + make_u32regex_token_iterator(const UChar* s, + const u32regex& e, + int sub, + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class Traits, class Alloc> +u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> + make_u32regex_token_iterator(const std::basic_string<charT, Traits, Alloc>& s, + const u32regex& e, + int sub, + regex_constants::match_flag_type m = regex_constants::match_default); + +u32regex_token_iterator<const UChar*> + make_u32regex_token_iterator(const UnicodeString& s, + const u32regex& e, + int sub, + regex_constants::match_flag_type m = regex_constants::match_default);+
+
Each of these overloads returns an iterator that enumerates all occurrences of + marked sub-expression sub in regular expression e, found + in text s, using match_flags m.
++template <std::size_t N> +u32regex_token_iterator<const char*> + make_u32regex_token_iterator(const char* p, + const u32regex& e, + const int (&submatch)[N], + regex_constants::match_flag_type m = regex_constants::match_default); + +template <std::size_t N> +u32regex_token_iterator<const wchar_t*> + make_u32regex_token_iterator(const wchar_t* p, + const u32regex& e, + const int (&submatch)[N], + regex_constants::match_flag_type m = regex_constants::match_default); + +template <std::size_t N> +u32regex_token_iterator<const UChar*> + make_u32regex_token_iterator(const UChar* p, + const u32regex& e, + const int (&submatch)[N], + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class Traits, class Alloc, std::size_t N> +u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> + make_u32regex_token_iterator(const std::basic_string<charT, Traits, Alloc>& p, + const u32regex& e, + const int (&submatch)[N], + regex_constants::match_flag_type m = regex_constants::match_default); + +template <std::size_t N> +u32regex_token_iterator<const UChar*> + make_u32regex_token_iterator(const UnicodeString& s, + const u32regex& e, + const int (&submatch)[N], + regex_constants::match_flag_type m = regex_constants::match_default); ++
Each of these overloads returns an iterator that enumerates one sub-expression + for each submatch in regular expression e, found in + text s, using match_flags m.
++u32regex_token_iterator<const char*> + make_u32regex_token_iterator(const char* p, + const u32regex& e, + const std::vector<int>& submatch, + regex_constants::match_flag_type m = regex_constants::match_default); + +u32regex_token_iterator<const wchar_t*> + make_u32regex_token_iterator(const wchar_t* p, + const u32regex& e, + const std::vector<int>& submatch, + regex_constants::match_flag_type m = regex_constants::match_default); + +u32regex_token_iterator<const UChar*> + make_u32regex_token_iterator(const UChar* p, + const u32regex& e, + const std::vector<int>& submatch, + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class Traits, class Alloc> +u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> + make_u32regex_token_iterator(const std::basic_string<charT, Traits, Alloc>& p, + const u32regex& e, + const std::vector<int>& submatch, + regex_constants::match_flag_type m = regex_constants::match_default); + +u32regex_token_iterator<const UChar*> + make_u32regex_token_iterator(const UnicodeString& s, + const u32regex& e, + const std::vector<int>& submatch, + regex_constants::match_flag_type m = regex_constants::match_default); ++
Each of these overloads returns an iterator that enumerates one sub-expression + for each submatch in regular expression e, found in + text s, using match_flags m.
+Example: search for international currency symbols, along with + their associated numeric value:
++void enumerate_currencies2(const std::string& text) +{ + // enumerate and print all the currency symbols, along + // with any associated numeric values: + const char* re = + "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?" + "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?" + "(?(1)" + "|(?(2)" + "[[:Cf:][:Cc:][:Z*:]]*" + ")" + "[[:Sc:]]" + ")"; + boost::u32regex r = boost::make_u32regex(re); + boost::u32regex_token_iterator<std::string::const_iterator> + i(boost::make_u32regex_token_iterator(text, r, 1)), j; + while(i != j) + { + std::cout << *i << std::endl; + ++i; + } +} ++
+
Revised + + 05 Jan 2005 +
+© Copyright John Maddock 2005
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + + diff --git a/doc/implementation.html b/doc/implementation.html new file mode 100644 index 00000000..d2a9b5f2 --- /dev/null +++ b/doc/implementation.html @@ -0,0 +1,43 @@ + + + ++
+ |
+
+ Boost.Regex+Implementation+ |
+
+ |
+
Todo.
++
Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + diff --git a/doc/install.html b/doc/install.html new file mode 100644 index 00000000..4e4834c9 --- /dev/null +++ b/doc/install.html @@ -0,0 +1,260 @@ + + + ++
+ |
+
+ Boost.Regex+Installation+ |
+
+ |
+
When you extract the library from its zip file, you must preserve its internal + directory structure (for example by using the -d option when extracting). If + you didn't do that when extracting, then you'd better stop reading this, delete + the files you just extracted, and try again! +
+This library should not need configuring before use; most popular + compilers/standard libraries/platforms are already supported "as is". If you do + experience configuration problems, or just want to test the configuration with + your compiler, then the process is the same as for all of boost; see the + configuration library documentation.
+The library will encase all code inside namespace boost. +
+Unlike some other template libraries, this library consists of a mixture of + template code (in the headers) and static code and data (in cpp files). + Consequently it is necessary to build the library's support code into a library + or archive file before you can use it, instructions for specific platforms are + as follows: +
+This is now the preferred method for building and installing this library, + please refer to the getting started + guide for more information.
+A default build of this library does not enable Unicode + support via ICU. There is no need to enable this support if you + don't need it, but if you use ICU for your Unicode support already, and want to + work with Unicode-aware regular expressions then read on.
+Most of the information you will need is in the + getting started guide, the only additional step you need to take is to + tell bjam that you want Boost.Regex to use ICU and optionally to tell bjam + where ICU is located.
+If you're building on a Unix-like platform, and ICU is already installed in + your compiler's search path (with an install prefix of /usr or /usr/local + for example), then set the environment variable HAVE_ICU to enable ICU + support. For example you might build with the command line:
+bjam -sHAVE_ICU=1 --toolset=toolset-name install+
If ICU is not already in your compiler's path then you need to set the + environment variable ICU_PATH to point to the root directory of your ICU + installation, for example if ICU was installed to /usr/local/icu/3.3 you might + use:
+bjam -sICU_PATH=/usr/local/icu/3.3 --toolset=toolset-name install+
Note that ICU is a C++ library just like Boost is, as such your copy of ICU + must have been built with the same C++ compiler (and compiler version) that you + are using to build Boost. Boost.Regex will not work correctly unless + you ensure that this is the case: it is up to you to ensure that + the version of ICU you are using is binary compatible with the toolset you use + to build Boost.
+make -fbcb5.mak+
The build process will build a variety of .lib and .dll files (the exact number + depends upon the version of Borland's tools you are using) the .lib and dll + files will be in a sub-directory called bcb4 or bcb5 depending upon the + makefile used. To install the libraries into your development system use:
+make -fbcb5.mak install+
library files will be copied to <BCROOT>/lib and the dll's to + <BCROOT>/bin, where <BCROOT> corresponds to the install path of + your Borland C++ tools. +
+You may also remove temporary files created during the build process (excluding + lib and dll files) by using:
+make -fbcb5.mak clean+
Finally when you use regex++ it is only necessary for you to add the + <boost> root directory to your list of include directories for that + project. It is not necessary for you to manually add a .lib file to the + project; the headers will automatically select the correct .lib file for your + build mode and tell the linker to include it. There is one caveat however: the + library can not tell the difference between VCL and non-VCL enabled builds when + building a GUI application from the command line, if you build from the command + line with the 5.5 command line tools then you must define the pre-processor + symbol _NO_VCL in order to ensure that the correct link libraries are selected: + the C++ Builder IDE normally sets this automatically. Hint, users of the 5.5 + command line tools may want to add a -D_NO_VCL to bcc32.cfg in order to set + this option permanently. +
+If you would prefer to do a dynamic link to the regex libraries when using the + dll runtime then define BOOST_REGEX_DYN_LINK (you must do this if you want to + use boost.regex in multiple dll's), otherwise Boost.regex will be statically + linked by default.
+If you want to suppress automatic linking altogether (and supply your own + custom build of the lib) then define BOOST_REGEX_NO_LIB.
+If you are building with C++ Builder 6, you will find that + <boost/regex.hpp> can not be used in a pre-compiled header (the actual + problem is in <locale> which gets included by <boost/regex.hpp>), + if this causes problems for you, then try defining BOOST_NO_STD_LOCALE when + building, this will disable some features throughout boost, but may save you a + lot in compile times!
+You need version 6 of MSVC to build this library. If you are using VC5 then you + may want to look at one of the previous releases of this + library +
+Open up a command prompt, which has the necessary MSVC environment variables + defined (for example by using the batch file Vcvars32.bat installed by the + Visual Studio installation), and change to the <boost>\libs\regex\build + directory. +
+Select the correct makefile - vc6.mak for "vanilla" Visual C++ 6 or + vc6-stlport.mak if you are using STLPort.
+Invoke the makefile like this:
+nmake -fvc6.mak+
You will now have a collection of lib and dll files in a "vc6" subdirectory, to + install these into your development system use:
+nmake -fvc6.mak install+
The lib files will be copied to your <VC6>\lib directory and the dll + files to <VC6>\bin, where <VC6> is the root of your Visual C++ 6 + installation.
+You can delete all the temporary files created during the build (excluding lib + and dll files) using:
+nmake -fvc6.mak clean+
If you want to build with ICU support, then you need to pass the path to your + ICU directory to the makefile, for example with: +
+nmake ICU_PATH=c:\open-source\icu -fvc71.mak install+
Finally when you use regex++ it is only necessary for you to add the + <boost> root directory to your list of include directories for that + project. It is not necessary for you to manually add a .lib file to the + project; the headers will automatically select the correct .lib file for your + build mode and tell the linker to include it. +
+Note that if you want to dynamically link to the regex library when using the + dynamic C++ runtime, define BOOST_REGEX_DYN_LINK when building your project.
+If you want to add the source directly to your project then define + BOOST_REGEX_NO_LIB to disable automatic library selection.
+There are several important caveats to remember when using boost.regex with + Microsoft's Compiler:
+You can build with gcc using the normal boost Jamfile in + <boost>/libs/regex/build, alternatively there is a conservative makefile + for the g++ compiler. From the command prompt change to the + <boost>/libs/regex/build directory and type: +
+make -fgcc.mak+
At the end of the build process you should have a gcc sub-directory containing + release and debug versions of the library (libboost_regex.a and + libboost_regex_debug.a). When you build projects that use regex++, you will + need to add the boost install directory to your list of include paths and add + <boost>/libs/regex/build/gcc/libboost_regex.a to your list of library + files. +
+There is also a makefile to build the library as a shared library:
+make -fgcc-shared.mak+
which will build libboost_regex.so and libboost_regex_debug.so.
+Both of these makefiles support the following environment variables:
+ICU_PATH: tells the makefile to build with Unicode support, set to the path
+ where your ICU installation is located, for example with: make
+ ICU_PATH=/usr/local install -fgcc.mak
CXXFLAGS: extra compiler options - note that this applies to both the debug and + release builds.
+INCLUDES: additional include directories.
+LDFLAGS: additional linker options.
+LIBS: additional library files.
+For the more adventurous there is a configure script in + <boost>/libs/config; see the config + library documentation.
+There is a makefile for the sun (6.1) compiler (C++ version 3.12). From the + command prompt change to the <boost>/libs/regex/build directory and type: +
+dmake -f sunpro.mak+
At the end of the build process you should have a sunpro sub-directory + containing single and multithread versions of the library (libboost_regex.a, + libboost_regex.so, libboost_regex_mt.a and libboost_regex_mt.so). When you + build projects that use regex++, you will need to add the boost install + directory to your list of include paths and add + <boost>/libs/regex/build/sunpro/ to your library search path. +
+This makefile supports the following environment variables:
+CXXFLAGS: extra compiler options - note that this applies to both the single + and multithreaded builds.
+INCLUDES: additional include directories.
+LDFLAGS: additional linker options.
+LIBS: additional library files.
+LIBSUFFIX: a suffix to mangle the library name with (defaults to nothing).
+This makefile does not set any architecture specific options like -xarch=v9, + you can set these by defining the appropriate macros, for example:
+dmake CXXFLAGS="-xarch=v9" LDFLAGS="-xarch=v9" LIBSUFFIX="_v9" -f sunpro.mak+
will build v9 variants of the regex library named libboost_regex_v9.a etc.
+There is a generic makefile (generic.mak ) + provided in <boost-root>/libs/regex/build - see that makefile for details + of environment variables that need to be set before use. +
Revised + + 09 Jan 2005 +
+© Copyright John Maddock 1998- + 2005
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/introduction.html b/doc/introduction.html new file mode 100644 index 00000000..3a4fcc7c --- /dev/null +++ b/doc/introduction.html @@ -0,0 +1,181 @@ + + + ++
+ |
+
+ Boost.Regex+Introduction+ |
+
+ |
+
Regular expressions are a form of pattern-matching that are often used in text + processing; many users will be familiar with the Unix utilities grep, sed + and awk, and the programming language Perl, each of which make + extensive use of regular expressions. Traditionally C++ users have been limited + to the POSIX C API's for manipulating regular expressions, and while regex++ + does provide these API's, they do not represent the best way to use the + library. For example regex++ can cope with wide character strings, or search + and replace operations (in a manner analogous to either sed or Perl), something + that traditional C libraries can not do.
+The class boost::basic_regex is the key class in + this library; it represents a "machine readable" regular expression, and is + very closely modeled on std::basic_string, think of it as a string plus the + actual state-machine required by the regular expression algorithms. Like + std::basic_string there are two typedefs that are almost always the means by + which this class is referenced:
+namespace boost{ + +template <class charT, + class traits = regex_traits<charT> > +class basic_regex; + +typedef basic_regex<char> regex; +typedef basic_regex<wchar_t> wregex; + +}+
To see how this library can be used, imagine that we are writing a credit card + processing application. Credit card numbers generally come as a string of + 16-digits, separated into groups of 4-digits, and separated by either a space + or a hyphen. Before storing a credit card number in a database (not necessarily + something your customers will appreciate!), we may want to verify that the + number is in the correct format. To match any digit we could use the regular + expression [0-9], however ranges of characters like this are actually locale + dependent. Instead we should use the POSIX standard form [[:digit:]], or the + regex++ and Perl shorthand for this \d (note that many older libraries tended + to be hard-coded to the C-locale, consequently this was not an issue for them). + That leaves us with the following regular expression to validate credit card + number formats:
+(\d{4}[- ]){3}\d{4}+
Here the parentheses act to group (and mark for future reference) + sub-expressions, and the {4} means "repeat exactly 4 times". This is an example + of the extended regular expression syntax used by Perl, awk and egrep. Regex++ + also supports the older "basic" syntax used by sed and grep, but this is + generally less useful, unless you already have some basic regular expressions + that you need to reuse.
+Now let's take that expression and place it in some C++ code to validate the + format of a credit card number:
+bool validate_card_format(const std::string& s) +{ + static const boost::regex e("(\\d{4}[- ]){3}\\d{4}"); + return regex_match(s, e); +}+
Note how we had to add some extra escapes to the expression: remember that the + escape is seen once by the C++ compiler, before it gets to be seen by the + regular expression engine, consequently escapes in regular expressions have to + be doubled up when embedding them in C/C++ code. Also note that all the + examples assume that your compiler supports Koenig lookup, if yours doesn't + (for example VC6), then you will have to add some boost:: prefixes to some of + the function calls in the examples.
+Those of you who are familiar with credit card processing, will have realized + that while the format used above is suitable for human readable card numbers, + it does not represent the format required by online credit card systems; these + require the number as a string of 16 (or possibly 15) digits, without any + intervening spaces. What we need is a means to convert easily between the two + formats, and this is where search and replace comes in. Those who are familiar + with the utilities sed and Perl will already be ahead here; we + need two strings - one a regular expression - the other a "format + string" that provides a description of the text to replace the match + with. In regex++ this search and replace operation is performed with the + algorithm regex_replace, for our credit card + example we can write two algorithms like this to provide the format + conversions:
+// match any format with the regular expression: +const boost::regex e("\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z"); +const std::string machine_format("\\1\\2\\3\\4"); +const std::string human_format("\\1-\\2-\\3-\\4"); + +std::string machine_readable_card_number(const std::string s) +{ + return regex_replace(s, e, machine_format, boost::match_default | boost::format_sed); +} + +std::string human_readable_card_number(const std::string s) +{ + return regex_replace(s, e, human_format, boost::match_default | boost::format_sed); +}+
Here we've used marked sub-expressions in the regular expression to split out + the four parts of the card number as separate fields, the format string then + uses the sed-like syntax to replace the matched text with the reformatted + version.
+In the examples above, we haven't directly manipulated the results of a regular + expression match, however in general the result of a match contains a number of + sub-expression matches in addition to the overall match. When the library needs + to report a regular expression match it does so using an instance of the class + match_results, as before there are typedefs of this class for the most + common cases: +
+namespace boost{ +typedef match_results<const char*> cmatch; +typedef match_results<const wchar_t*> wcmatch; +typedef match_results<std::string::const_iterator> smatch; +typedef match_results<std::wstring::const_iterator> wsmatch; +}+
The algorithms regex_search and regex_match + make use of match_results to report what matched; the difference between these + algorithms is that regex_match will only find + matches that consume all of the input text, whereas + regex_search will search for a match anywhere within the text + being matched.
+Note that these algorithms are not restricted to searching regular C-strings, + any bidirectional iterator type can be searched, allowing for the possibility + of seamlessly searching almost any kind of data. +
+For search and replace operations, in addition to the algorithm + regex_replace that we have already seen, the match_results + class has a format member that takes the result of a match and a format string, + and produces a new string by merging the two.
+For iterating through all occurrences of an expression within a text, there are + two iterator types: regex_iterator will + enumerate over the match_results objects + found, while regex_token_iterator will + enumerate a series of strings (similar to Perl style split operations).
+For those that dislike templates, there is a high level wrapper class RegEx + that is an encapsulation of the lower level template code - it provides a + simplified interface for those that don't need the full power of the library, + and supports only narrow characters, and the "extended" regular expression + syntax. This class is now deprecated as it does not form part of the regular + expressions C++ standard library proposal. +
+The POSIX API functions: regcomp, regexec, regfree + and regerror, are available in both narrow character and Unicode versions, and + are provided for those who need compatibility with these API's. +
+Finally, note that the library now has run-time localization + support, and recognizes the full POSIX regular expression syntax - including + advanced features like multi-character collating elements and equivalence + classes - as well as providing compatibility with other regular expression + libraries including GNU and BSD4 regex packages, and to a more limited extent + Perl 5. +
++
Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + diff --git a/doc/localisation.html b/doc/localisation.html new file mode 100644 index 00000000..31bd8663 --- /dev/null +++ b/doc/localisation.html @@ -0,0 +1,808 @@ + + + +
+ |
+
+ Boost.Regex+Localisation+ |
+
+ |
+
Boost.regex provides extensive support for run-time localization, the + localization model used can be split into two parts: front-end and back-end.
+Front-end localization deals with everything which the user sees - error + messages, and the regular expression syntax itself. For example a French + application could change [[:word:]] to [[:mot:]] and \w to \m. Modifying the + front end locale requires active support from the developer, by providing the + library with a message catalogue to load, containing the localized strings. + Front-end locale is affected by the LC_MESSAGES category only.
+Back-end localization deals with everything that occurs after the expression + has been parsed - in other words everything that the user does not see or + interact with directly. It deals with case conversion, collation, and character + class membership. The back-end locale does not require any intervention from + the developer - the library will acquire all the information it requires for + the current locale from the underlying operating system / run time library. + This means that if the program user does not interact with regular expressions + directly - for example if the expressions are embedded in your C++ code - then + no explicit localization is required, as the library will take care of + everything for you. For example embedding the expression [[:word:]]+ in your + code will always match a whole word, if the program is run on a machine with, + for example, a Greek locale, then it will still match a whole word, but in + Greek characters rather than Latin ones. The back-end locale is affected by the + LC_CTYPE and LC_COLLATE categories.
+There are three separate localization mechanisms supported by boost.regex:
+This is the default model when the library is compiled under Win32, and is + encapsulated by the traits class w32_regex_traits. When this model is in effect + each basic_regex object gets its own LCID; by default this is the user's + default setting as returned by GetUserDefaultLCID, but you can call imbue + on the basic_regex object to set its locale to some other LCID if you wish. + All the settings used by boost.regex are acquired directly from the operating + system bypassing the C run time library. Front-end localization requires a + resource dll, containing a string table with the user-defined strings. The + traits class exports the function:
+static std::string set_message_catalogue(const std::string& s);
+which needs to be called with a string identifying the name of the resource + dll, before your code compiles any regular expressions (but not + necessarily before you construct any basic_regex instances):
++ boost::w32_regex_traits<char>::set_message_catalogue("mydll.dll");
++ The library provides full Unicode support under NT, under Windows 9x the + library degrades gracefully - characters 0 to 255 are supported, the remainder + are treated as "unknown" graphic characters.
+This model has been deprecated in favor of the C++ locale for all non-Windows + compilers that support it. This locale is encapsulated by the traits + class c_regex_traits; Win32 users can force this model to take effect by + defining the pre-processor symbol BOOST_REGEX_USE_C_LOCALE. When this model is + in effect there is a single global locale, as set by setlocale. All + settings are acquired from your run time library, consequently Unicode support + is dependent upon your run time library implementation.
+Front end localization is not supported.
+Note that calling setlocale invalidates all compiled regular + expressions; calling setlocale(LC_ALL, "C") will make this library + behave equivalently to most traditional regular expression libraries including + version 1 of this library.
+This model is the default for non-Windows compilers.
++ When this model is in effect each instance of basic_regex<> has its own + instance of std::locale, class basic_regex<> also has a member function imbue + which allows the locale for the expression to be set on a per-instance basis. + Front end localization requires a POSIX message catalogue, which will be loaded + via the std::messages facet of the expression's locale, the traits class + exports the symbol:
+static std::string set_message_catalogue(const std::string& s);
+which needs to be called with a string identifying the name of the message + catalogue, before your code compiles any regular expressions (but not + necessarily before you construct any basic_regex instances):
++ boost::cpp_regex_traits<char>::set_message_catalogue("mycatalogue");
+Note that calling basic_regex<>::imbue will invalidate any expression + currently compiled in that instance of basic_regex<>.
+Finally note that if you build the library with a non-default localization + model, then the appropriate pre-processor symbol (BOOST_REGEX_USE_C_LOCALE or + BOOST_REGEX_USE_CPP_LOCALE) must be defined both when you build the support + library, and when you include <boost/regex.hpp> or + <boost/cregex.hpp> in your code. The best way to ensure this is to add + the #define to <boost/regex/user.hpp>.
+
+ In order to localize the front end of the library, you need to provide the
+ library with the appropriate message strings contained either in a resource
+ dll's string table (Win32 model), or a POSIX message catalogue (C++ models). In
+ the latter case the messages must appear in message set zero of the catalogue.
+ The messages and their id's are as follows:
+
+ | Message id | +Meaning | +Default value | ++ |
+ | 101 | +The character used to start a sub-expression. | +"(" | ++ |
+ | 102 | +The character used to end a sub-expression + declaration. | +")" | ++ |
+ | 103 | +The character used to denote an end of line + assertion. | +"$" | ++ |
+ | 104 | +The character used to denote the start of line + assertion. | +"^" | ++ |
+ | 105 | +The character used to denote the "match any character + expression". | +"." | ++ |
+ | 106 | +The match zero or more times repetition operator. | +"*" | ++ |
+ | 107 | +The match one or more repetition operator. | +"+" | ++ |
+ | 108 | +The match zero or one repetition operator. | +"?" | ++ |
+ | 109 | +The character set opening character. | +"[" | ++ |
+ | 110 | +The character set closing character. | +"]" | ++ |
+ | 111 | +The alternation operator. | +"|" | ++ |
+ | 112 | +The escape character. | +"\\" | ++ |
+ | 113 | +The hash character (not currently used). | +"#" | ++ |
+ | 114 | +The range operator. | +"-" | ++ |
+ | 115 | +The repetition operator opening character. | +"{" | ++ |
+ | 116 | +The repetition operator closing character. | +"}" | ++ |
+ | 117 | +The digit characters. | +"0123456789" | ++ |
+ | 118 | +The character which when preceded by an escape + character represents the word boundary assertion. | +"b" | ++ |
+ | 119 | +The character which when preceded by an escape + character represents the non-word boundary assertion. | +"B" | ++ |
+ | 120 | +The character which when preceded by an escape + character represents the word-start boundary assertion. | +"<" | ++ |
+ | 121 | +The character which when preceded by an escape + character represents the word-end boundary assertion. | +">" | ++ |
+ | 122 | +The character which when preceded by an escape + character represents any word character. | +"w" | ++ |
+ | 123 | +The character which when preceded by an escape + character represents a non-word character. | +"W" | ++ |
+ | 124 | +The character which when preceded by an escape + character represents a start of buffer assertion. | +"`A" | ++ |
+ | 125 | +The character which when preceded by an escape + character represents an end of buffer assertion. | +"'z" | ++ |
+ | 126 | +The newline character. | +"\n" | ++ |
+ | 127 | +The comma separator. | +"," | ++ |
+ | 128 | +The character which when preceded by an escape + character represents the bell character. | +"a" | ++ |
+ | 129 | +The character which when preceded by an escape + character represents the form feed character. | +"f" | ++ |
+ | 130 | +The character which when preceded by an escape + character represents the newline character. | +"n" | ++ |
+ | 131 | +The character which when preceded by an escape + character represents the carriage return character. | +"r" | ++ |
+ | 132 | +The character which when preceded by an escape + character represents the tab character. | +"t" | ++ |
+ | 133 | +The character which when preceded by an escape + character represents the vertical tab character. | +"v" | ++ |
+ | 134 | +The character which when preceded by an escape + character represents the start of a hexadecimal character constant. | +"x" | ++ |
+ | 135 | +The character which when preceded by an escape + character represents the start of an ASCII escape character. | +"c" | ++ |
+ | 136 | +The colon character. | +":" | ++ |
+ | 137 | +The equals character. | +"=" | ++ |
+ | 138 | +The character which when preceded by an escape + character represents the ASCII escape character. | +"e" | ++ |
+ | 139 | +The character which when preceded by an escape + character represents any lower case character. | +"l" | ++ |
+ | 140 | +The character which when preceded by an escape + character represents any non-lower case character. | +"L" | ++ |
+ | 141 | +The character which when preceded by an escape + character represents any upper case character. | +"u" | ++ |
+ | 142 | +The character which when preceded by an escape + character represents any non-upper case character. | +"U" | ++ |
+ | 143 | +The character which when preceded by an escape + character represents any space character. | +"s" | ++ |
+ | 144 | +The character which when preceded by an escape + character represents any non-space character. | +"S" | ++ |
+ | 145 | +The character which when preceded by an escape + character represents any digit character. | +"d" | ++ |
+ | 146 | +The character which when preceded by an escape + character represents any non-digit character. | +"D" | ++ |
+ | 147 | +The character which when preceded by an escape + character represents the end quote operator. | +"E" | ++ |
+ | 148 | +The character which when preceded by an escape + character represents the start quote operator. | +"Q" | ++ |
+ | 149 | +The character which when preceded by an escape + character represents a Unicode combining character sequence. | +"X" | ++ |
+ | 150 | +The character which when preceded by an escape + character represents any single character. | +"C" | ++ |
+ | 151 | +The character which when preceded by an escape + character represents end of buffer operator. | +"Z" | ++ |
+ | 152 | +The character which when preceded by an escape + character represents the continuation assertion. | +"G" | ++ |
+ | 153 | +The character which when preceeded by (? indicates a zero width negated + forward lookahead assert. | +! | ++ |
Custom error messages are loaded as follows:
+ ++ | Message ID | +Error message ID | +Default string | ++ |
+ | 201 | +REG_NOMATCH | +"No match" | ++ |
+ | 202 | +REG_BADPAT | +"Invalid regular expression" | ++ |
+ | 203 | +REG_ECOLLATE | +"Invalid collation character" | ++ |
+ | 204 | +REG_ECTYPE | +"Invalid character class name" | ++ |
+ | 205 | +REG_EESCAPE | +"Trailing backslash" | ++ |
+ | 206 | +REG_ESUBREG | +"Invalid back reference" | ++ |
+ | 207 | +REG_EBRACK | +"Unmatched [ or [^" | ++ |
+ | 208 | +REG_EPAREN | +"Unmatched ( or \\(" | ++ |
+ | 209 | +REG_EBRACE | +"Unmatched \\{" | ++ |
+ | 210 | +REG_BADBR | +"Invalid content of \\{\\}" | ++ |
+ | 211 | +REG_ERANGE | +"Invalid range end" | ++ |
+ | 212 | +REG_ESPACE | +"Memory exhausted" | ++ |
+ | 213 | +REG_BADRPT | +"Invalid preceding regular expression" | ++ |
+ | 214 | +REG_EEND | +"Premature end of regular expression" | ++ |
+ | 215 | +REG_ESIZE | +"Regular expression too big" | ++ |
+ | 216 | +REG_ERPAREN | +"Unmatched ) or \\)" | ++ |
+ | 217 | +REG_EMPTY | +"Empty expression" | ++ |
+ | 218 | +REG_E_UNKNOWN | +"Unknown error" | ++ |
Custom character class names are loaded as follows:
+ ++ | Message ID | +Description | +Equivalent default class name | ++ |
+ | 300 | +The character class name for alphanumeric characters. | +"alnum" | ++ |
+ | 301 | +The character class name for alphabetic characters. | +"alpha" | ++ |
+ | 302 | +The character class name for control characters. | +"cntrl" | ++ |
+ | 303 | +The character class name for digit characters. | +"digit" | ++ |
+ | 304 | +The character class name for graphics characters. | +"graph" | ++ |
+ | 305 | +The character class name for lower case characters. | +"lower" | ++ |
+ | 306 | +The character class name for printable characters. | +"print" | ++ |
+ | 307 | +The character class name for punctuation characters. | +"punct" | ++ |
+ | 308 | +The character class name for space characters. | +"space" | ++ |
+ | 309 | +The character class name for upper case characters. | +"upper" | ++ |
+ | 310 | +The character class name for hexadecimal characters. | +"xdigit" | ++ |
+ | 311 | +The character class name for blank characters. | +"blank" | ++ |
+ | 312 | +The character class name for word characters. | +"word" | ++ |
+ | 313 | +The character class name for Unicode characters. | +"unicode" | ++ |
Finally, custom collating element names are loaded starting from message id + 400, and terminating when the first load thereafter fails. Each message looks + something like: "tagname string" where tagname is the name used inside + [[.tagname.]] and string is the actual text of the collating element. + Note that the value of collating element [[.zero.]] is used for the conversion + of strings to numbers - if you replace this with another value then that will + be used for string parsing - for example use the Unicode character 0x0660 for + [[.zero.]] if you want to use Unicode Arabic-Indic digits in your regular + expressions in place of Latin digits.
+Note that the POSIX defined names for character classes and collating elements + are always available - even if custom names are defined, in contrast, custom + error messages, and custom syntax messages replace the default ones.
+ +Revised + + 26 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/match_flag_type.html b/doc/match_flag_type.html new file mode 100644 index 00000000..64f61402 --- /dev/null +++ b/doc/match_flag_type.html @@ -0,0 +1,295 @@ + + + +
+ |
+
+ Boost.Regex+match_flag_type+ |
+
+ |
+
The type match_flag_type
is an implementation specific bitmask
+ type (17.3.2.1.2) that controls how a regular expression is matched against a
+ character sequence. The behavior of the format flags is described in more
+ detail in the format syntax guide.
+namespace boost{ namespace regex_constants{ + +typedef implemenation-specific-bitmask-type match_flag_type; + +static const match_flag_type match_default = 0; +static const match_flag_type match_not_bob; +static const match_flag_type match_not_eob; +static const match_flag_type match_not_bol; +static const match_flag_type match_not_eol; +static const match_flag_type match_not_bow; +static const match_flag_type match_not_eow; +static const match_flag_type match_any; +static const match_flag_type match_not_null; +static const match_flag_type match_continuous; +static const match_flag_type match_partial; +static const match_flag_type match_single_line; +static const match_flag_type match_prev_avail; +static const match_flag_type match_not_dot_newline; +static const match_flag_type match_not_dot_null; + +static const match_flag_type format_default = 0; +static const match_flag_type format_sed; +static const match_flag_type format_perl; +static const match_flag_type format_literal;+
+static const match_flag_type format_no_copy; +static const match_flag_type format_first_only; +static const match_flag_type format_all; + +} // namespace regex_constants +} // namespace boost +
The type match_flag_type
is an implementation specific bitmask
+ type (17.3.2.1.2). When matching a regular expression against a sequence of
+ characters [first, last) then setting its elements has the effects listed in
+ the table below:
+ Element + |
+
+ Effect if set + |
+
+ match_default + |
+
+ Specifies that matching of regular expressions proceeds without any + modification of the normal rules used in ECMA-262, ECMAScript Language + Specification, Chapter 15 part 10, RegExp (Regular Expression) Objects (FWD.1) + |
+
match_not_bob | +Specifies that the expressions "\A" and + "\`" should not match against the sub-sequence [first,first). | +
match_not_eob | +Specifies that the expressions "\'", "\z" and + "\Z" should not match against the sub-sequence [last,last). | +
+ match_not_bol + |
+
+ Specifies that the expression "^" should not be matched against the + sub-sequence [first,first). + |
+
+ match_not_eol + |
+
+ Specifies that the expression "$" should not be matched against the + sub-sequence [last,last). + |
+
+ match_not_bow + |
+
+ Specifies that the expressions "\<" and "\b" should not be matched + against the sub-sequence [first,first). + |
+
+ match_not_eow + |
+
+ Specifies that the expressions "\>" and "\b" should not be matched + against the sub-sequence [last,last). + |
+
+ match_any + |
+
+ Specifies that if more than one match is possible then any match is an + acceptable result: this will still find the leftmost match, but may not find + the "best" match at that position. Use this flag if you care about the + speed of matching, but don't care what was matched (only whether there is one + or not). + |
+
+ match_not_null + |
+
+ Specifies that the expression can not be matched against an empty sequence. + |
+
+ match_continuous + |
+
+ Specifies that the expression must match a sub-sequence that begins at first. + |
+
+ match_partial + |
+
+ Specifies that if no match can be found, then it is acceptable to return a + match [from, last) such that from!= last, if there could exist some longer + sequence of characters [from,to) of which [from,last) is a prefix, and which + would result in a full match. +This flag is used when matching incomplete or very long texts, see the + partial matches documentation for more information. + |
+
match_extra | +Instructs the matching engine to retain all available + capture information; if a capturing group is repeated then information + about every repeat is available via match_results::captures() + or sub_match_captures(). | +
match_single_line | +Equivalent to the inverse of Perl's m/ modifier; + prevents ^ from matching after an embedded newline character (so that it only + matches at the start of the text being matched), and $ from matching before an + embedded newline (so that it only matches at the end of the text being + matched). | +
+ match_prev_avail + |
+
+ Specifies that |
+
match_not_dot_newline | +Specifies that the expression "." does not match a + newline character. This is the inverse of Perl's s/ modifier. | +
match_not_dot_null | +Specifies that the expression "." does not match a + null character '\0'. | +
+ format_default + |
+
+ Specifies that when a regular expression match is to be replaced by a new + string, that the new string is constructed using the rules used by the + ECMAScript replace function in ECMA-262, ECMAScript Language Specification, + Chapter 15 part 5.4.11 String.prototype.replace. (FWD.1). In addition during + search and replace operations then all non-overlapping occurrences of the + regular expression are located and replaced, and sections of the input that did + not match the expression, are copied unchanged to the output string. + |
+
+ format_sed + |
+
+ Specifies that when a regular expression match is to be replaced by a new + string, that the new string is constructed using the rules used by the Unix sed + utility in IEEE Std 1003.1-2001, Portable Operating System Interface (POSIX), + Shells and Utilities. + |
+
+ format_perl + |
+
+ + Specifies that when a regular expression match is to be replaced by a new + string, that the new string is constructed using the same rules as Perl 5. + |
+
format_literal | +Specifies that when a regular expression match is to + be replaced by a new string, that the new string is a literal copy of the + replacement text. | +
format_all | +Specifies that all syntax extensions are + enabled, including conditional (?ddexpression1:expression2) replacements: see + the format string guide for more details. | +
+ format_no_copy + |
+
+ When specified during a search and replace operation, then sections of the + character container sequence being searched that do not match the regular + expression, are not copied to the output string. + |
+
+ format_first_only + |
+
+ When specified during a search and replace operation, then only the first + occurrence of the regular expression is replaced. + |
+
Revised + + 04 Feb 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/match_results.html b/doc/match_results.html new file mode 100644 index 00000000..6634b90a --- /dev/null +++ b/doc/match_results.html @@ -0,0 +1,459 @@ + + + +
+ |
+
+ Boost.Regex+class match_results+ |
+
+ |
+
#include <boost/regex.hpp>
+Regular expressions are different from many simple pattern-matching algorithms + in that as well as finding an overall match they can also produce + sub-expression matches: each sub-expression being delimited in the pattern by a + pair of parentheses (...). There has to be some method for reporting + sub-expression matches back to the user: this is achieved by defining a + class match_results that acts as an indexed collection of sub-expression + matches, each sub-expression match being contained in an object of type + sub_match .
+Template class match_results denotes a collection of character sequences + representing the result of a regular expression match. Objects of type + match_results are passed to the algorithms regex_match + and regex_search, and are returned by the + iterator regex_iterator . Storage for + the collection is allocated and freed as necessary by the member functions of + class match_results.
+The template class match_results conforms to the requirements of a Sequence, as + specified in (lib.sequence.reqmts), except that only operations defined for + const-qualified Sequences are supported.
+Class template match_results is most commonly used as one of the typedefs + cmatch, wcmatch, smatch, or wsmatch:
+template <class BidirectionalIterator, + class Allocator = std::allocator<sub_match<BidirectionalIterator> > +class match_results; + +typedef match_results<const char*> cmatch; +typedef match_results<const wchar_t*> wcmatch; +typedef match_results<string::const_iterator> smatch; +typedef match_results<wstring::const_iterator> wsmatch; + +template <class BidirectionalIterator, + class Allocator = std::allocator<sub_match<BidirectionalIterator> > +class match_results +{ +public: + typedef sub_match<BidirectionalIterator> value_type; + typedef const value_type& const_reference; + typedef const_reference reference; + typedef implementation defined const_iterator; + typedef const_iterator iterator; + typedef typename iterator_traits<BidirectionalIterator>::difference_type difference_type; + typedef typename Allocator::size_type size_type; + typedef Allocator allocator_type; + typedef typename iterator_traits<BidirectionalIterator>::value_type char_type; + typedef basic_string<char_type> string_type; + + // construct/copy/destroy: + explicit match_results(const Allocator& a = Allocator()); + match_results(const match_results& m); + match_results& operator=(const match_results& m); + ~match_results(); + + // size: + size_type size() const; + size_type max_size() const; + bool empty() const; + // element access: + difference_type length(int sub = 0) const; + difference_type position(unsigned int sub = 0) const; + string_type str(int sub = 0) const; + const_reference operator[](int n) const; + + const_reference prefix() const; + + const_reference suffix() const; + const_iterator begin() const; + const_iterator end() const; + // format: + template <class OutputIterator> + OutputIterator format(OutputIterator out, + const string_type& fmt, + match_flag_type flags = format_default) const; + string_type format(const string_type& fmt, + match_flag_type flags = format_default) const; + + allocator_type get_allocator() const; + void swap(match_results& that); + +#ifdef 
BOOST_REGEX_MATCH_EXTRA + typedef typename value_type::capture_sequence_type capture_sequence_type; + const capture_sequence_type& captures(std::size_t i)const; +#endif + +}; + +template <class BidirectionalIterator, class Allocator> +bool operator == (const match_results<BidirectionalIterator, Allocator>& m1, + const match_results<BidirectionalIterator, Allocator>& m2); +template <class BidirectionalIterator, class Allocator> +bool operator != (const match_results<BidirectionalIterator, Allocator>& m1, + const match_results<BidirectionalIterator, Allocator>& m2); + +template <class charT, class traits, class BidirectionalIterator, class Allocator> +basic_ostream<charT, traits>& + operator << (basic_ostream<charT, traits>& os, + const match_results<BidirectionalIterator, Allocator>& m); + +template <class BidirectionalIterator, class Allocator> +void swap(match_results<BidirectionalIterator, Allocator>& m1, + match_results<BidirectionalIterator, Allocator>& m2); ++
In all match_results
constructors, a copy of the Allocator
+ argument is used for any memory allocation performed by the constructor or
+ member functions during the lifetime of the object.
+match_results(const Allocator& a = Allocator()); ++ +
Effects: Constructs an object of class match_results. The postconditions + of this function are indicated in the table:
+ +
+ Element + |
+
+ Value + |
+
+ empty() + |
+
+ true + |
+
+ size() + |
+
+ 0 + |
+
+ str() + |
+
+ basic_string<charT>() + |
+
+
+match_results(const match_results& m); ++ +
Effects: Constructs an object of class match_results, as a copy of m.
++match_results& operator=(const match_results& m); ++ +
Effects: Assigns m to *this. The postconditions of this function are + indicated in the table:
+ +
+ Element + |
+
+ Value + |
+
+ empty() + |
+
+ m.empty(). + |
+
+ size() + |
+
+ m.size(). + |
+
+ str(n) + |
+
+ m.str(n) for all integers n < m.size(). + |
+
+ prefix() + |
+
+ m.prefix(). + |
+
+ suffix() + |
+
+ m.suffix(). + |
+
+ (*this)[n] + |
+
+ m[n] for all integers n < m.size(). + |
+
+ length(n) + |
+
+ m.length(n) for all integers n < m.size(). + |
+
+ position(n) + |
+
+ m.position(n) for all integers n < m.size(). + |
+
+size_type size()const; ++ +
Effects: Returns the number of sub_match elements stored in *this; that + is the number of marked sub-expressions in the regular expression that was + matched plus one.
++size_type max_size()const; ++ +
Effects: Returns the maximum number of sub_match elements that can be + stored in *this.
++bool empty()const; ++ +
Effects: Returns size() == 0
.
+difference_type length(int sub = 0)const; ++ +
Effects: Returns the length of sub-expression sub, that is to
+ say: (*this)[sub].length()
.
+difference_type position(unsigned int sub = 0)const; ++ +
Effects: Returns the starting location of sub-expression sub,
+ or -1 if sub was not matched. Note that if this represents a
+ partial match , then position()
will return the location of
+ the partial match even though (*this)[0].matched
is false.
+string_type str(int sub = 0)const; ++ +
Effects: Returns sub-expression sub as a string: string_type((*this)[sub]).
+const_reference operator[](int n) const; ++ +
Effects: Returns a reference to the sub_match
object
+ representing the character sequence that matched marked sub-expression n.
+ If n == 0
then returns a reference to a sub_match
object
+ representing the character sequence that matched the whole regular
+ expression. If n is out of range, or if n is an
+ unmatched sub-expression, then returns a sub_match object whose matched
+ member is false.
+const_reference prefix()const; ++ +
Effects: Returns a reference to the sub_match
object
+ representing the character sequence from the start of the string being
+ matched/searched, to the start of the match found.
+const_reference suffix()const; ++ +
Effects: Returns a reference to the sub_match
object
+ representing the character sequence from the end of the match found to the end
+ of the string being matched/searched.
+const_iterator begin()const; ++ +
Effects: Returns a starting iterator that enumerates over all the marked + sub-expression matches stored in *this.
++const_iterator end()const; ++ +
Effects: Returns a terminating iterator that enumerates over all the + marked sub-expression matches stored in *this.
+template <class OutputIterator> +OutputIterator format(OutputIterator out, + const string_type& fmt, + match_flag_type flags = format_default) const; ++ +
Requires: The type OutputIterator conforms to the Output Iterator + requirements (24.1.2).
+ +Effects: Copies the character sequence [fmt.begin(), fmt.end()) to + OutputIterator out. For each format specifier or escape sequence in fmt, + replace that sequence with either the character(s) it represents, or the + sequence of characters within *this to which it refers. The bitmasks specified + in flags determine what + format specifiers or escape sequences are recognized; by default this is + the format used by ECMA-262, ECMAScript Language Specification, Chapter 15 part + 5.4.11 String.prototype.replace.
+ +Returns: out.
++string_type format(const string_type& fmt, + match_flag_type flags = format_default) const; ++ +
Effects: Returns a copy of the string fmt. For each format + specifier or escape sequence in fmt, replace that sequence with either + the character(s) it represents, or the sequence of characters within *this to + which it refers. The bitmasks specified in flags + determine what format specifiers or escape sequences + are recognized; by default this is the format used by ECMA-262, + ECMAScript Language Specification, Chapter 15 part 5.4.11 + String.prototype.replace.
+allocator_type get_allocator()const; ++ +
Effects: Returns a copy of the Allocator that was passed to the object's + constructor.
+void swap(match_results& that); ++ +
Effects: Swaps the contents of the two sequences.
+ +Postcondition: *this
contains the sequence of matched
+ sub-expressions that were in that
, that
contains the
+ sequence of matched sub-expressions that were in *this
.
Complexity: constant time.
+typedef typename value_type::capture_sequence_type capture_sequence_type;+
Defines an implementation-specific type that satisfies the requirements of + a standard library Sequence (21.1.1 including the optional Table 68 + operations), whose value_type is a sub_match<BidirectionalIterator>. This + type happens to be std::vector<sub_match<BidirectionalIterator> >, + but you shouldn't actually rely on that.
+const capture_sequence_type& captures(std::size_t i)const;+
Effects: returns a sequence containing all the captures + obtained for sub-expression i.
+Returns: (*this)[i].captures();
Preconditions: the library must be built and used with + BOOST_REGEX_MATCH_EXTRA defined, and you must pass the flag + match_extra to the regex matching functions (regex_match, + regex_search, regex_iterator + or regex_token_iterator) in order for + this member function to be defined and return useful information.
+Rationale: Enabling this feature has several consequences: +
+template <class BidirectionalIterator, class Allocator> +bool operator == (const match_results<BidirectionalIterator, Allocator>& m1, + const match_results<BidirectionalIterator, Allocator>& m2);+
Effects: Compares the two sequences for equality.
+template <class BidirectionalIterator, class Allocator> +bool operator != (const match_results<BidirectionalIterator, Allocator>& m1, + const match_results<BidirectionalIterator, Allocator>& m2);+
Effects: Compares the two sequences for inequality.
+template <class charT, class traits, class BidirectionalIterator, class Allocator> +basic_ostream<charT, traits>& + operator << (basic_ostream<charT, traits>& os, + const match_results<BidirectionalIterator, Allocator>& m);+
Effects: Writes the contents of m to the stream os as
+ if by calling os << m.str();
+ Returns: os.
template <class BidirectionalIterator, class Allocator> +void swap(match_results<BidirectionalIterator, Allocator>& m1, + match_results<BidirectionalIterator, Allocator>& m2);+
Effects: Swaps the contents of the two sequences.
+ +Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/mfc_strings.html b/doc/mfc_strings.html new file mode 100644 index 00000000..f1d733b7 --- /dev/null +++ b/doc/mfc_strings.html @@ -0,0 +1,294 @@ + + + ++
+ |
+
+ Boost.Regex+Working With MFC/ATL String Types.+ |
+
+ |
+
The header <boost/regex/mfc.hpp> provides Boost.Regex support for MFC + string types: note that this support requires Visual Studio .NET (Visual C++ 7) + or later, where all of the MFC and ATL string types are based around + the CSimpleStringT class template.
+In the following documentation, whenever you see CSimpleStringT<charT>, + then you can substitute any of the following MFC/ATL types (all of which + inherit from CSimpleStringT):
+CString
+ CStringA
+ CStringW
+ CAtlString
+ CAtlStringA
+ CAtlStringW
+ CStringT<charT,traits>
+ CFixedStringT<charT,N>
+ CSimpleStringT<charT>
The following typedefs are provided for the convenience of those working with + TCHAR's:
+typedef basic_regex<TCHAR> tregex; +typedef match_results<TCHAR const*> tmatch; +typedef regex_iterator<TCHAR const*> tregex_iterator; +typedef regex_token_iterator<TCHAR const*> tregex_token_iterator; ++
If you are working with explicitly narrow or wide characters rather than TCHAR, + then use the regular Boost.Regex types instead.
+The following helper function is available to assist in the creation of a + regular expression from an MFC/ATL string type:
+template <class charT> +basic_regex<charT> + make_regex(const ATL::CSimpleStringT<charT>& s, + ::boost::regex_constants::syntax_option_type f = boost::regex_constants::normal);+
Effects: returns basic_regex<charT>(s.GetString(), + s.GetString() + s.GetLength(), f);
+For each regular expression algorithm that's overloaded for a std::basic_string + argument, there is also one overloaded for the MFC/ATL string types. + These algorithm signatures all look a lot more complex than they actually + are, but for completeness here they are anyway:
+There are two overloads, the first reports what matched in a match_results + structure, the second does not. +
+All the usual caveats for regex_match apply, in + particular the algorithm will only report a successful match if all of the + input text matches the expression, if this isn't what you want then + use regex_search instead.
+template <class charT, class T, class A> +bool regex_match( + const ATL::CSimpleStringT<charT>& s, + match_results<const charT*, A>& what, + const basic_regex<charT, T>& e, + boost::regex_constants::match_flag_type f = boost::regex_constants::match_default);+
+
Effects: returns ::boost::regex_match(s.GetString(), + s.GetString() + s.GetLength(), what, e, f);
+Example:
+// +// Extract filename part of a path from a CString and return the result +// as another CString: +// +CString get_filename(const CString& path) +{ + boost::tregex r(__T("(?:\\A|.*\\\\)([^\\\\]+)")); + boost::tmatch what; + if(boost::regex_match(path, what, r)) + { + // extract $1 as a CString: + return CString(what[1].first, what.length(1)); + } + else + { + throw std::runtime_error("Invalid pathname"); + } +} ++
template <class charT, class T> +bool regex_match( + const ATL::CSimpleStringT<charT>& s, + const basic_regex<charT, T>& e, + boost::regex_constants::match_flag_type f = boost::regex_constants::match_default)+
+
Effects: returns ::boost::regex_match(s.GetString(), + s.GetString() + s.GetLength(), e, f);
+Example:
+// +// Find out if *password* meets our password requirements, +// as defined by the regular expression *requirements*. +// +bool is_valid_password(const CString& password, const CString& requirements) +{ + return boost::regex_match(password, boost::make_regex(requirements)); +}+
There are two additional overloads for regex_search: + the first reports what matched, the second does not:
+template <class charT, class A, class T> +bool regex_search(const ATL::CSimpleStringT<charT>& s, + match_results<const charT*, A>& what, + const basic_regex<charT, T>& e, + boost::regex_constants::match_flag_type f = boost::regex_constants::match_default)+
Effects: returns ::boost::regex_search(s.GetString(), + s.GetString() + s.GetLength(), what, e, f);
+Example: Postcode extraction from an address string.
+CString extract_postcode(const CString& address) +{ + // searches throw address for a UK postcode and returns the result, + // the expression used is by Phil A. on www.regxlib.com: + boost::tregex r(__T("^(([A-Z]{1,2}[0-9]{1,2})|([A-Z]{1,2}[0-9][A-Z]))\\s?([0-9][A-Z]{2})$")); + boost::tmatch what; + if(boost::regex_search(address, what, r)) + { + // extract $0 as a CString: + return CString(what[0].first, what.length()); + } + else + { + throw std::runtime_error("No postcode found"); + } +}+
template <class charT, class T> +inline bool regex_search(const ATL::CSimpleStringT<charT>& s, + const basic_regex<charT, T>& e, + boost::regex_constants::match_flag_type f = boost::regex_constants::match_default) ++
Effects: returns ::boost::regex_search(s.GetString(), + s.GetString() + s.GetLength(), e, f);
+There are two additional overloads for regex_replace, + the first sends output to an output iterator, while the second creates a new + string
+template <class OutputIterator, class BidirectionalIterator, class traits, class + charT> +OutputIterator regex_replace(OutputIterator out, + BidirectionalIterator first, + BidirectionalIterator last, + const basic_regex<charT, traits>& e, + const ATL::CSimpleStringT<charT>& fmt, + match_flag_type flags = match_default) ++
Effects: returns ::boost::regex_replace(out, + first, last, e, fmt.GetString(), flags);
+template <class traits, class charT> +ATL::CSimpleStringT<charT> regex_replace(const ATL::CSimpleStringT<charT>& s, + const basic_regex<charT, traits>& e, + const ATL::CSimpleStringT<charT>& fmt, + match_flag_type flags = match_default)+
Effects: returns a new string created using + regex_replace, and the same memory manager as string s.
+Example:
+// +// Take a credit card number as a string of digits, +// and reformat it as a human readable string with "-" +// separating each group of four digits: +// +const boost::tregex e(__T("\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z")); +const CString human_format = __T("$1-$2-$3-$4"); + +CString human_readable_card_number(const CString& s) +{ + return boost::regex_replace(s, e, human_format); +} ++
The following helper functions are provided to ease the conversion from an + MFC/ATL string to a regex_iterator or + regex_token_iterator:
+template <class charT> +regex_iterator<charT const*> + make_regex_iterator( + const ATL::CSimpleStringT<charT>& s, + const basic_regex<charT>& e, + ::boost::regex_constants::match_flag_type f = boost::regex_constants::match_default); ++
Effects: returns regex_iterator(s.GetString(), + s.GetString() + s.GetLength(), e, f);
+Example:
+void enumerate_links(const CString& html) +{ + // enumerate and print all the links in some HTML text, + // the expression used is by Andew Lee on www.regxlib.com: + boost::tregex r(__T("href=[\"\']((http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?)[\"\']")); + boost::tregex_iterator i(boost::make_regex_iterator(html, r)), j; + while(i != j) + { + std::cout << (*i)[1] << std::endl; + ++i; + } +} ++
template <class charT> +regex_token_iterator<charT const*> + make_regex_token_iterator( + const ATL::CSimpleStringT<charT>& s, + const basic_regex<charT>& e, + int sub = 0, + ::boost::regex_constants::match_flag_type f = boost::regex_constants::match_default); ++
Effects: returns regex_token_iterator(s.GetString(), + s.GetString() + s.GetLength(), e, sub, f);
+template <class charT> +regex_token_iterator<charT const*> + make_regex_token_iterator( + const ATL::CSimpleStringT<charT>& s, + const basic_regex<charT>& e, + const std::vector<int>& subs, + ::boost::regex_constants::match_flag_type f = boost::regex_constants::match_default); ++
Effects: returns regex_token_iterator(s.GetString(), + s.GetString() + s.GetLength(), e, subs, f);
+template <class charT, std::size_t N> +regex_token_iterator<charT const*> + make_regex_token_iterator( + const ATL::CSimpleStringT<charT>& s, + const basic_regex<charT>& e, + const int (& subs)[N], + ::boost::regex_constants::match_flag_type f = boost::regex_constants::match_default); ++
Effects: returns regex_token_iterator(s.GetString(), + s.GetString() + s.GetLength(), e, subs, f);
+Example:
+void enumerate_links2(const CString& html) +{ + // enumerate and print all the links in some HTML text, + // the expression used is by Andew Lee on www.regxlib.com: + boost::tregex r(__T("href=[\"\']((http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?)[\"\']")); + boost::tregex_token_iterator i(boost::make_regex_token_iterator(html, r, 1)), j; + while(i != j) + { + std::cout << *i << std::endl; + ++i; + } +}+
Revised + + 21 Dec 2004 +
+© Copyright John Maddock 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/non_standard_strings.html b/doc/non_standard_strings.html new file mode 100644 index 00000000..5196abcf --- /dev/null +++ b/doc/non_standard_strings.html @@ -0,0 +1,53 @@ + + + ++
+ |
+
+ Boost.Regex+Working With Non-Standard String Types.+ |
+
+ |
+
The Boost.Regex algorithms and iterators are all iterator-based, with + convenience overloads of the algorithms provided that convert standard library + string types to iterator pairs internally. If you want to search a + non-standard string type then the trick is to convert that string into an + iterator pair: so far I haven't come across any string types that can't be + handled this way, even if they're not officially iterator based. + Certainly any string type that provides access to its internal buffer, along + with its length, can be converted into a pair of pointers (which can be used + as iterators).
+Some non-standard string types are sufficiently common that wrappers have been + provided for them:
+MFC/ATL Strings.
+ ICU Strings.
+
Revised + + 24 Nov 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/partial_matches.html b/doc/partial_matches.html new file mode 100644 index 00000000..f523fc9a --- /dev/null +++ b/doc/partial_matches.html @@ -0,0 +1,195 @@ + + + ++
+ |
+
+ Boost.Regex+Partial Matches+ |
+
+ |
+
The match-flag match_partial
can
+ be passed to the following algorithms: regex_match,
+ regex_search, and regex_grep,
+ and used with the iterator regex_iterator.
+ When used it indicates that partial as well as full matches should be found. A
+ partial match is one that matched one or more characters at the end of the text
+ input, but did not match all of the regular expression (although it may have
+ done so had more input been available). Partial matches are typically used when
+ either validating data input (checking each character as it is entered on the
+ keyboard), or when searching texts that are either too long to load into memory
+ (or even into a memory mapped file), or are of indeterminate length (for
+ example the source may be a socket or similar). Partial and full matches can be
+ differentiated as shown in the following table (the variable M represents an
+ instance of match_results<> as filled in
+ by regex_match, regex_search or regex_grep):
+
+
+ | Result | +M[0].matched | +M[0].first | +M[0].second | +
No match | +False | +Undefined | +Undefined | +Undefined | +
Partial match | +True | +False | +Start of partial match. | +End of partial match (end of text). | +
Full match | +True | +True | +Start of full match. | +End of full match. | +
Be aware that using partial matches can sometimes result in somewhat imperfect + behavior:
+The following example
+ tests to see whether the text could be a valid credit card number, as the user
+ presses a key, the character entered would be added to the string being built
+ up, and passed to is_possible_card_number
. If this returns true
+ then the text could be a valid card number, so the user interface's OK button
+ would be enabled. If it returns false, then this is not yet a valid card
+ number, but could be with more input, so the user interface would disable the
+ OK button. Finally, if the procedure throws an exception the input could never
+ become a valid number, and the inputted character must be discarded, and a
+ suitable error indication displayed to the user.
#include <string> +#include <iostream> +#include <boost/regex.hpp> + +boost::regex e("(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})"); + +bool is_possible_card_number(const std::string& input) +{ + // + // return false for partial match, true for full match, or throw for + // impossible match based on what we have so far... + boost::match_results<std::string::const_iterator> what; + if(0 == boost::regex_match(input, what, e, boost::match_default | boost::match_partial)) + { + // the input so far could not possibly be valid so reject it: + throw std::runtime_error("Invalid data entered - this could not possibly be a valid card number"); + } + // OK so far so good, but have we finished? + if(what[0].matched) + { + // excellent, we have a result: + return true; + } + // what we have so far is only a partial match... + return false; +}+
In the following example, + text input is taken from a stream containing an unknown amount of text; this + example simply counts the number of html tags encountered in the stream. The + text is loaded into a buffer and searched a part at a time, if a partial match + was encountered, then the partial match gets searched a second time as the + start of the next batch of text:
+#include <iostream> +#include <fstream> +#include <sstream> +#include <string> +#include <boost/regex.hpp> + +// match some kind of html tag: +boost::regex e("<[^>]*>"); +// count how many: +unsigned int tags = 0; +// saved position of partial match: +char* next_pos = 0; + +bool grep_callback(const boost::match_results<char*>& m) +{ + if(m[0].matched == false) + { + // save position and return: + next_pos = m[0].first; + } + else + ++tags; + return true; +} + +void search(std::istream& is) +{ + char buf[4096]; + next_pos = buf + sizeof(buf); + bool have_more = true; + while(have_more) + { + // how much do we copy forward from last try: + unsigned leftover = (buf + sizeof(buf)) - next_pos; + // and how much is left to fill: + unsigned size = next_pos - buf; + // copy forward whatever we have left: + memcpy(buf, next_pos, leftover); + // fill the rest from the stream: + unsigned read = is.readsome(buf + leftover, size); + // check to see if we've run out of text: + have_more = read == size; + // reset next_pos: + next_pos = buf + sizeof(buf); + // and then grep: + boost::regex_grep(grep_callback, + buf, + buf + read + leftover, + e, + boost::match_default | boost::match_partial); + } +}+
+
Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/performance.html b/doc/performance.html new file mode 100644 index 00000000..c7897ff3 --- /dev/null +++ b/doc/performance.html @@ -0,0 +1,52 @@ + + + ++
+ |
+
+ Boost.Regex+Performance+ |
+
+ |
+
The performance of Boost.regex in both recursive and non-recursive modes should + be broadly comparable to other regular expression libraries: recursive mode is + slightly faster (especially where memory allocation requires thread + synchronisation), but not by much. The following pages compare + Boost.regex with various other regular expression libraries for the following + compilers:
+Visual Studio.Net 2003 (recursive Boost.regex + implementation).
+Gcc 3.2 (cygwin) (non-recursive Boost.regex + implementation).
++
Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + diff --git a/doc/posix_api.html b/doc/posix_api.html new file mode 100644 index 00000000..967b8407 --- /dev/null +++ b/doc/posix_api.html @@ -0,0 +1,286 @@ + + + ++
+ |
+
+ Boost.Regex+POSIX API Compatibility Functions+ |
+
+ |
+
#include <boost/cregex.hpp> +or: +#include <boost/regex.h>+
The following functions are available for users who need a POSIX compatible C + library, they are available in both Unicode and narrow character versions, the + standard POSIX API names are macros that expand to one version or the other + depending upon whether UNICODE is defined or not. +
+Important: Note that all the symbols defined here are enclosed inside + namespace boost when used in C++ programs, unless you use #include + <boost/regex.h> instead - in which case the symbols are still defined in + namespace boost, but are made available in the global namespace as well.
+The functions are defined as: +
+extern "C" { +int regcompA(regex_tA*, const char*, int); +unsigned int regerrorA(int, const regex_tA*, char*, unsigned int); +int regexecA(const regex_tA*, const char*, unsigned int, regmatch_t*, int); +void regfreeA(regex_tA*); + +int regcompW(regex_tW*, const wchar_t*, int); +unsigned int regerrorW(int, const regex_tW*, wchar_t*, unsigned int); +int regexecW(const regex_tW*, const wchar_t*, unsigned int, regmatch_t*, int); +void regfreeW(regex_tW*); + +#ifdef UNICODE +#define regcomp regcompW +#define regerror regerrorW +#define regexec regexecW +#define regfree regfreeW +#define regex_t regex_tW +#else +#define regcomp regcompA +#define regerror regerrorA +#define regexec regexecA +#define regfree regfreeA +#define regex_t regex_tA +#endif +}+
All the functions operate on structure regex_t, which exposes two public + members: +
+unsigned int re_nsub this is filled in by regcomp and indicates + the number of sub-expressions contained in the regular expression. +
+const TCHAR* re_endp points to the end of the expression to compile when + the flag REG_PEND is set. +
+Footnote: regex_t is actually a #define - it is either regex_tA or regex_tW + depending upon whether UNICODE is defined or not, TCHAR is either char or + wchar_t again depending upon the macro UNICODE. +
+regcomp takes a pointer to a regex_t, a pointer to the expression
+ to compile and a flags parameter which can be a combination of:
+
+
+
+
+ | REG_EXTENDED | +Compiles modern regular expressions. Equivalent to + regbase::char_classes | regbase::intervals | regbase::bk_refs. | ++ |
+ | REG_BASIC | +Compiles basic (obsolete) regular expression syntax. + Equivalent to regbase::char_classes | regbase::intervals | regbase::limited_ops + | regbase::bk_braces | regbase::bk_parens | regbase::bk_refs. | ++ |
+ | REG_NOSPEC | +All characters are ordinary, the expression is a + literal string. | ++ |
+ | REG_ICASE | +Compiles for matching that ignores character case. | ++ |
+ | REG_NOSUB | +Has no effect in this library. | ++ |
+ | REG_NEWLINE | +When this flag is set a dot does not match the + newline character. | ++ |
+ | REG_PEND | +When this flag is set the re_endp parameter of the + regex_t structure must point to the end of the regular expression to compile. | ++ |
+ | REG_NOCOLLATE | +When this flag is set then locale dependent collation + for character ranges is turned off. | ++ |
+ | REG_ESCAPE_IN_LISTS + , , , + |
+ When this flag is set, then escape sequences are + permitted in bracket expressions (character sets). | ++ |
+ | REG_NEWLINE_ALT | +When this flag is set then the newline character is + equivalent to the alternation operator |. | ++ |
+ | REG_PERL | +Compiles Perl like regular expressions. | ++ |
+ | REG_AWK | +A shortcut for awk-like behavior: REG_EXTENDED | + REG_ESCAPE_IN_LISTS | ++ |
+ | REG_GREP | +A shortcut for grep like behavior: REG_BASIC | + REG_NEWLINE_ALT | ++ |
+ | REG_EGREP | +A shortcut for egrep like behavior: + REG_EXTENDED | REG_NEWLINE_ALT | ++ |
regerror takes the following parameters, it maps an error code to a human
+ readable string:
+
+
+
+ | int code | +The error code. | ++ |
+ | const regex_t* e | +The regular expression (can be null). | ++ |
+ | char* buf | +The buffer to fill in with the error message. | ++ |
+ | unsigned int buf_size | +The length of buf. | ++ |
If the error code is OR'ed with REG_ITOA then the message that results is the + printable name of the code rather than a message, for example "REG_BADPAT". If + the code is REG_ATOI then e must not be null and e->re_endp must + point to the printable name of an error code, the return value is then the + value of the error code. For any other value of code, the return value + is the number of characters in the error message, if the return value is + greater than or equal to buf_size then regerror will have to be + called again with a larger buffer.
+regexec finds the first occurrence of expression e within string buf.
+ If len is non-zero then *m is filled in with what matched the
+ regular expression, m[0] contains what matched the whole string, m[1]
+ the first sub-expression etc, see regmatch_t in the header file
+ declaration for more details. The eflags parameter can be a combination
+ of:
+
+
+
+
+ | REG_NOTBOL | +Parameter buf does not represent the start of + a line. | ++ |
+ | REG_NOTEOL | +Parameter buf does not terminate at the end of + a line. | ++ |
+ | REG_STARTEND | +The string searched starts at buf + pmatch[0].rm_so + and ends at buf + pmatch[0].rm_eo. | ++ |
Finally regfree frees all the memory that was allocated by regcomp. +
+Footnote: this is an abridged reference to the POSIX API functions, it is + provided for compatibility with other libraries, rather than an API to be used + in new code (unless you need access from a language other than C++). This + version of these functions should also happily coexist with other versions, as + the names used are macros that expand to the actual function names. +
+
Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + diff --git a/doc/redistributables.html b/doc/redistributables.html new file mode 100644 index 00000000..cdad4739 --- /dev/null +++ b/doc/redistributables.html @@ -0,0 +1,55 @@ + + + ++
+ |
+
+ Boost.Regex+Redistributables and Library Names+ |
+
+ |
+
If you are using Microsoft or Borland C++ and link to a dll version of the run + time library, then you can choose to also link to a dll version of boost.regex + by defining the symbol BOOST_REGEX_DYN_LINK when you compile your code. While + these dll's are redistributable, there are no "standard" versions, so when + installing on the user's PC, you should place these in a directory private to + your application, and not in the PC's directory path. Note that if you link to + a static version of your run time library, then you will also link to a static + version of boost.regex and no dll's will need to be distributed. The possible + boost.regex dll and library names are computed according to the + formula given in the getting started guide. +
+Note: you can disable automatic library selection by defining the symbol + BOOST_REGEX_NO_LIB when compiling, this is useful if you want to build + Boost.Regex yourself in your IDE, or if you need to debug boost.regex. +
++
Revised + + 28 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/reg_expression.html b/doc/reg_expression.html new file mode 100644 index 00000000..15962278 --- /dev/null +++ b/doc/reg_expression.html @@ -0,0 +1,44 @@ + + + ++
+ |
+
+ Boost.Regex+Class reg_expression (deprecated)+ |
+
+ |
+
The use of class template reg_expression is deprecated: use + basic_regex instead.
++
Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + diff --git a/doc/regbase.html b/doc/regbase.html new file mode 100644 index 00000000..e2cf5c54 --- /dev/null +++ b/doc/regbase.html @@ -0,0 +1,82 @@ + + + + +
+ |
+
+Boost.Regex+ +regbase+ |
+
+ |
+
Use of the type boost::regbase
is now deprecated,
+and the type does not form a part of the
+regular expression standardization proposal. This type
+still exists as a base class of boost::basic_regex
,
+and you can still refer to
+boost::regbase::constant_name
in your code, however for
+maximum portability to other std regex implementations you should
+instead use either:
+boost::regex_constants::constant_name ++ +
or
+ ++boost::regex::constant_name ++ +
or
+ ++boost::wregex::constant_name ++ + + +
Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + + diff --git a/doc/regex.html b/doc/regex.html new file mode 100644 index 00000000..7a5f29de --- /dev/null +++ b/doc/regex.html @@ -0,0 +1,481 @@ + + + +
+ |
+
+ Boost.Regex+class RegEx (deprecated)+ |
+
+ |
+
The high level wrapper class RegEx is now deprecated and does not form a part + of the regular + expression standardization proposal. This type still exists, and + existing code will continue to compile, however the following documentation is + unlikely to be further updated.
++#include <boost/cregex.hpp> ++
The class RegEx provides a high level simplified interface to the regular + expression library, this class only handles narrow character strings, and + regular expressions always follow the "normal" syntax - that is the same as the + perl / ECMAScript syntax.
++typedef bool (*GrepCallback)(const RegEx& expression); +typedef bool (*GrepFileCallback)(const char* file, const RegEx& expression); +typedef bool (*FindFilesCallback)(const char* file); + +class RegEx +{ +public: + RegEx(); + RegEx(const RegEx& o); + ~RegEx(); + RegEx(const char* c, bool icase = false); + explicit RegEx(const std::string& s, bool icase = false); + RegEx& operator=(const RegEx& o); + RegEx& operator=(const char* p); + RegEx& operator=(const std::string& s); + unsigned int SetExpression(const char* p, bool icase = false); + unsigned int SetExpression(const std::string& s, bool icase = false); + std::string Expression()const; + // + // now matching operators: + // + bool Match(const char* p, boost::match_flag_type flags = match_default); + bool Match(const std::string& s, boost::match_flag_type flags = match_default); + bool Search(const char* p, boost::match_flag_type flags = match_default); + bool Search(const std::string& s, boost::match_flag_type flags = match_default); + unsigned int Grep(GrepCallback cb, const char* p, boost::match_flag_type flags = match_default); + unsigned int Grep(GrepCallback cb, const std::string& s, boost::match_flag_type flags = match_default); + unsigned int Grep(std::vector<std::string>& v, const char* p, boost::match_flag_type flags = match_default); + unsigned int Grep(std::vector<std::string>& v, const std::string& s, boost::match_flag_type flags = match_default); + unsigned int Grep(std::vector<unsigned int>& v, const char* p, boost::match_flag_type flags = match_default); + unsigned int Grep(std::vector<unsigned int>& v, const std::string& s, boost::match_flag_type flags = match_default); + unsigned int GrepFiles(GrepFileCallback cb, const char* files, bool recurse = false, boost::match_flag_type flags = match_default); + unsigned int GrepFiles(GrepFileCallback cb, const std::string& files, bool recurse = false, boost::match_flag_type flags = match_default); + unsigned int FindFiles(FindFilesCallback cb, const 
char* files, bool recurse = false, boost::match_flag_type flags = match_default); + unsigned int FindFiles(FindFilesCallback cb, const std::string& files, bool recurse = false, boost::match_flag_type flags = match_default); + std::string Merge(const std::string& in, const std::string& fmt, bool copy = true, boost::match_flag_type flags = match_default); + std::string Merge(const char* in, const char* fmt, bool copy = true, boost::match_flag_type flags = match_default); + unsigned Split(std::vector<std::string>& v, std::string& s, boost::match_flag_type flags = match_default, unsigned max_count = ~0); + // + // now operators for returning what matched in more detail: + // + unsigned int Position(int i = 0)const; + unsigned int Length(int i = 0)const; + bool Matched(int i = 0)const; + unsigned int Line()const; + unsigned int Marks() const; + std::string What(int i)const; + std::string operator[](int i)const ; + + static const unsigned int npos; +}; ++
Member functions for class RegEx are defined as follows:
+
+ | RegEx(); | +Default constructor, constructs an instance of RegEx + without any valid expression. | ++ |
+ | RegEx(const RegEx& o); | +Copy constructor, all the properties of parameter o + are copied. | ++ |
+ | RegEx(const char* c, bool icase + = false); | +Constructs an instance of RegEx, setting the + expression to c, if icase is true then matching is + insensitive to case, otherwise it is sensitive to case. Throws bad_expression + on failure. | ++ |
+ | RegEx(const std::string& s, bool icase + = false); | +Constructs an instance of RegEx, setting the + expression to s, if icase is true then matching is + insensitive to case, otherwise it is sensitive to case. Throws bad_expression + on failure. | ++ |
+ | RegEx& operator=(const RegEx& + o); | +Default assignment operator. | ++ |
+ | RegEx& operator=(const char* + p); | +Assignment operator, equivalent to calling SetExpression(p, + false). Throws bad_expression on failure. | ++ |
+ | RegEx& operator=(const std::string& + s); | +Assignment operator, equivalent to calling SetExpression(s, + false). Throws bad_expression on failure. | ++ |
+ | unsigned int SetExpression(const char* + p, bool icase = false); | +Sets the current expression to p, if icase + is true then matching is insensitive to case, otherwise it is sensitive + to case. Throws bad_expression on failure. | ++ |
+ | unsigned int SetExpression(const + std::string& s, bool icase = false); | +Sets the current expression to s, if icase + is true then matching is insensitive to case, otherwise it is sensitive + to case. Throws bad_expression on failure. | ++ |
+ | std::string Expression()const; | +Returns a copy of the current regular expression. | ++ |
+ | bool Match(const char* p, + boost::match_flag_type flags = match_default); | +Attempts to match the current expression against the + text p using the match flags flags - see + match flags. Returns true if the expression matches the whole of + the input string. | ++ |
+ | bool Match(const std::string& s, + boost::match_flag_type flags = match_default) ; | +Attempts to match the current expression against the + text s using the match flags flags - see + match flags. Returns true if the expression matches the whole of + the input string. | ++ |
+ | bool Search(const char* p, + boost::match_flag_type flags = match_default); | +Attempts to find a match for the current expression + somewhere in the text p using the match flags flags - see + match flags. Returns true if the match succeeds. | ++ |
+ | bool Search(const std::string& s, + boost::match_flag_type flags = match_default) ; | +Attempts to find a match for the current expression + somewhere in the text s using the match flags flags - see + match flags. Returns true if the match succeeds. | ++ |
+ | unsigned int Grep(GrepCallback cb, const + char* p, boost::match_flag_type flags = match_default); | +Finds all matches of the current expression in the
+ text p using the match flags flags - see
+ match flags. For each match found calls the call-back function cb
+ as: cb(*this);
+ If at any stage the call-back function returns false then the grep operation + terminates, otherwise continues until no further matches are found. Returns the + number of matches found. + |
+ + |
+ | unsigned int Grep(GrepCallback cb, const + std::string& s, boost::match_flag_type flags = match_default); | +Finds all matches of the current expression in the
+ text s using the match flags flags - see
+ match flags. For each match found calls the call-back function cb
+ as: cb(*this);
+ If at any stage the call-back function returns false then the grep operation + terminates, otherwise continues until no further matches are found. Returns the + number of matches found. + |
+ + |
+ | unsigned int Grep(std::vector<std::string>& + v, const char* p, boost::match_flag_type flags = match_default); | +Finds all matches of the current expression in the + text p using the match flags flags - see + match flags. For each match pushes a copy of what matched onto v. + Returns the number of matches found. | ++ |
+ | unsigned int Grep(std::vector<std::string>& + v, const std::string& s, boost::match_flag_type flags = + match_default); | +Finds all matches of the current expression in the + text s using the match flags flags - see + match flags. For each match pushes a copy of what matched onto v. + Returns the number of matches found. | ++ |
+ | unsigned int Grep(std::vector<unsigned + int>& v, const char* p, boost::match_flag_type + flags = match_default); | +Finds all matches of the current expression in the + text p using the match flags flags - see + match flags. For each match pushes the starting index of what matched + onto v. Returns the number of matches found. | ++ |
+ | unsigned int Grep(std::vector<unsigned + int>& v, const std::string& s, boost::match_flag_type + flags = match_default); | +Finds all matches of the current expression in the + text s using the match flags flags - see + match flags. For each match pushes the starting index of what matched + onto v. Returns the number of matches found. | ++ |
+ | unsigned int GrepFiles(GrepFileCallback + cb, const char* files, bool recurse = false, + boost::match_flag_type flags = match_default); | +Finds all matches of the current expression in the
+ files files using the match flags flags - see
+ match flags. For each match calls the call-back function cb.
+ If the call-back returns false then the algorithm returns without considering + further matches in the current file, or any further files. +The parameter files can include wild card characters '*' and '?', if the + parameter recurse is true then searches sub-directories for matching + file names. +Returns the total number of matches found. +May throw an exception derived from std::runtime_error if file io fails. + |
+ + |
+ | unsigned int GrepFiles(GrepFileCallback + cb, const std::string& files, bool recurse = false, + boost::match_flag_type flags = match_default); | +Finds all matches of the current expression in the
+ files files using the match flags flags - see
+ match flags. For each match calls the call-back function cb.
+ If the call-back returns false then the algorithm returns without considering + further matches in the current file, or any further files. +The parameter files can include wild card characters '*' and '?', if the + parameter recurse is true then searches sub-directories for matching + file names. +Returns the total number of matches found. +May throw an exception derived from std::runtime_error if file io fails. + |
+ + |
+ | unsigned int FindFiles(FindFilesCallback + cb, const char* files, bool recurse = false, + boost::match_flag_type flags = match_default); | +Searches files to find all those which contain
+ at least one match of the current expression using the match flags flags
+ - see match flags. For each matching file
+ calls the call-back function cb.
+ If the call-back returns false then the algorithm returns without considering + any further files. +The parameter files can include wild card characters '*' and '?', if the + parameter recurse is true then searches sub-directories for matching + file names. +Returns the total number of files found. +May throw an exception derived from std::runtime_error if file io fails. + |
+ + |
+ | unsigned int FindFiles(FindFilesCallback + cb, const std::string& files, bool recurse = false, + boost::match_flag_type flags = match_default); | +Searches files to find all those which contain
+ at least one match of the current expression using the match flags flags
+ - see match flags. For each matching file
+ calls the call-back function cb.
+ If the call-back returns false then the algorithm returns without considering + any further files. +The parameter files can include wild card characters '*' and '?', if the + parameter recurse is true then searches sub-directories for matching + file names. +Returns the total number of files found. +May throw an exception derived from std::runtime_error if file io fails. + |
+ + |
+ | std::string Merge(const std::string& in, const + std::string& fmt, bool copy = true, boost::match_flag_type + flags = match_default); | +Performs a search and replace operation: searches + through the string in for all occurrences of the current expression, for + each occurrence replaces the match with the format string fmt. Uses flags + to determine what gets matched, and how the format string should be treated. If + copy is true then all unmatched sections of input are copied unchanged + to output, if the flag format_first_only is set then only the first + occurrence of the pattern found is replaced. Returns the new string. See + also format string syntax, match flags + and format flags. | ++ |
+ | std::string Merge(const char* in, const + char* fmt, bool copy = true, boost::match_flag_type flags = + match_default); | +Performs a search and replace operation: searches + through the string in for all occurrences of the current expression, for + each occurrence replaces the match with the format string fmt. Uses flags + to determine what gets matched, and how the format string should be treated. If + copy is true then all unmatched sections of input are copied unchanged + to output, if the flag format_first_only is set then only the first + occurrence of the pattern found is replaced. Returns the new string. See + also format string syntax, match flags + and format flags. | ++ |
+ | unsigned Split(std::vector<std::string>& v, + std::string& s, boost::match_flag_type flags = match_default, unsigned + max_count = ~0); | +Splits the input string and pushes each one onto the vector. If + the expression contains no marked sub-expressions, then one string is outputted + for each section of the input that does not match the expression. If the + expression does contain marked sub-expressions, then outputs one string for + each marked sub-expression each time a match occurs. Outputs no more than max_count + strings. Before returning, deletes from the input string s all of the + input that has been processed (all of the string if max_count was not + reached). Returns the number of strings pushed onto the vector. | ++ |
+ | unsigned int Position(int i = 0)const; | +Returns the position of what matched sub-expression i. + If i = 0 then returns the position of the whole match. Returns + RegEx::npos if the supplied index is invalid, or if the specified + sub-expression did not participate in the match. | ++ |
+ | unsigned int Length(int i = 0)const; | +Returns the length of what matched sub-expression i. + If i = 0 then returns the length of the whole match. Returns RegEx::npos + if the supplied index is invalid, or if the specified sub-expression did not + participate in the match. | ++ |
+ | bool Matched(int i = 0)const; | +Returns true if sub-expression i was matched, false otherwise. | ++ |
+ | unsigned int Line()const; | +Returns the line on which the match occurred, indexes + start from 1 not zero, if no match occurred then returns RegEx::npos. | ++ |
+ | unsigned int Marks() const; | +Returns the number of marked sub-expressions + contained in the expression. Note that this includes the whole match + (sub-expression zero), so the value returned is always >= 1. | ++ |
+ | std::string What(int i)const; | +Returns a copy of what matched sub-expression i. + If i = 0 then returns a copy of the whole match. Returns a null string + if the index is invalid or if the specified sub-expression did not participate + in a match. | ++ |
+ | std::string operator[](int i)const + ; | +Returns What(i);
+ Can be used to simplify access to sub-expression matches, and make usage more + perl-like. + |
+ + |
Revised + + 04 Feb 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/regex_format.html b/doc/regex_format.html new file mode 100644 index 00000000..74f21e9c --- /dev/null +++ b/doc/regex_format.html @@ -0,0 +1,204 @@ + + + + +
+ |
+
+Boost.Regex+ +Algorithm regex_format (deprecated)+ |
+
+ |
+
The algorithm regex_format is deprecated; new code should use +match_results::format instead. Existing code will continue to +compile, the following documentation is taken from the previous +version of boost.regex and will not be further updated:
+ ++#include <boost/regex.hpp> ++ +
The algorithm regex_format takes the results of a match and +creates a new string based upon a +format string, regex_format can be used for search and replace +operations:
+ ++template <class OutputIterator, class iterator, class Allocator, class charT> +OutputIterator regex_format(OutputIterator out, + const match_results<iterator, Allocator>& m, + const charT* fmt, + match_flag_type flags = 0); +template <class OutputIterator, class iterator, class Allocator, class charT> +OutputIterator regex_format(OutputIterator out, + const match_results<iterator, Allocator>& m, + const std::basic_string<charT>& fmt, + match_flag_type flags = 0); ++ +
The library also defines the following convenience variation of +regex_format, which returns the result directly as a string, rather +than outputting to an iterator [note - this version may not be +available, or may be available in a more limited form, depending +upon your compilers capabilities]:
+ ++template <class iterator, class Allocator, class charT> +std::basic_string<charT> regex_format + (const match_results<iterator, Allocator>& m, + const charT* fmt, + match_flag_type flags = 0); + +template <class iterator, class Allocator, class charT> +std::basic_string<charT> regex_format + (const match_results<iterator, Allocator>& m, + const std::basic_string<charT>& fmt, + match_flag_type flags = 0); ++ +
Parameters to the main version of the function are passed as +follows:
+ + + ++ | OutputIterator out | +An output iterator type, the output +string is sent to this iterator. Typically this would be a +std::ostream_iterator. | ++ |
+ | const +match_results<iterator, Allocator>& m | +An instance of match_results<> +obtained from one of the matching algorithms above, and denoting +what matched. | ++ |
+ | const charT* fmt | +A format string that determines how +the match is transformed into the new string. | ++ |
+ | match_flag_type flags | +Optional flags which describe how the +format string is to be interpreted. | ++ |
Format flags are defined as +follows:
+ + + ++ | format_all | +Enables all syntax options (perl-like +plus extensions). | ++ |
+ | format_sed | +Allows only a sed-like syntax. | ++ |
+ | format_perl | +Allows only a perl-like syntax. | ++ |
+ | format_no_copy | +Disables copying of unmatched sections +to the output string during +regex_merge operations. | ++ |
+ | format_first_only | +When this flag is set only the first occurrence will be replaced +(applies to regex_merge only). | ++ |
The format string syntax (and available options) is described +more fully under format strings +.
+ + + +Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + + diff --git a/doc/regex_grep.html b/doc/regex_grep.html new file mode 100644 index 00000000..ac1d804b --- /dev/null +++ b/doc/regex_grep.html @@ -0,0 +1,377 @@ + + + +
+ |
+
+ Boost.Regex+Algorithm regex_grep (deprecated)+ |
+
+ |
+
The algorithm regex_grep is deprecated in favor of regex_iterator + which provides a more convenient and standard library friendly interface.
+The following documentation is taken unchanged from the previous boost release, + and will not be updated in future.
++#include <boost/regex.hpp> ++
regex_grep allows you to search through a bidirectional-iterator range and + locate all the (non-overlapping) matches with a given regular expression. The + function is declared as:
++template <class Predicate, class iterator, class charT, class traits> +unsigned int regex_grep(Predicate foo, + iterator first, + iterator last, + const basic_regex<charT, traits>& e, + boost::match_flag_type flags = match_default) ++
The library also defines the following convenience versions, which take either + a const charT*, or a const std::basic_string<>& in place of a pair of + iterators [note - these versions may not be available, or may be available in a + more limited form, depending upon your compilers capabilities]:
++template <class Predicate, class charT, class traits> +unsigned int regex_grep(Predicate foo, + const charT* str, + const basic_regex<charT, traits>& e, + boost::match_flag_type flags = match_default); + +template <class Predicate, class ST, class SA, class charT, class traits> +unsigned int regex_grep(Predicate foo, + const std::basic_string<charT, ST, SA>& s, + const basic_regex<charT, traits>& e, + boost::match_flag_type flags = match_default); ++
The parameters for the primary version of regex_grep have the following + meanings:
+ ++ | foo | +A predicate function object or function pointer, see + below for more information. | ++ |
+ | first | +The start of the range to search. | ++ |
+ | last | +The end of the range to search. | ++ |
+ | e | +The regular expression to search for. | ++ |
+ | flags | +The flags that determine how matching is carried out, + one of the match_flags enumerators. | ++ |
The algorithm finds all of the non-overlapping matches of the expression e, for + each match it fills a match_results<iterator> + structure, which contains information on what matched, and calls the predicate + foo, passing the match_results<iterator> as a single argument. If the + predicate returns true, then the grep operation continues, otherwise it + terminates without searching for further matches. The function returns the + number of matches found.
+The general form of the predicate is:
++struct grep_predicate +{ + bool operator()(const match_results<iterator_type>& m); +}; ++
For example the regular expression "a*b" would find one match in the string + "aaaaab" and two in the string "aaabb".
+Remember this algorithm can be used for a lot more than implementing a version + of grep, the predicate can be and do anything that you want, grep utilities + would output the results to the screen, another program could index a file + based on a regular expression and store a set of bookmarks in a list, or a text + file conversion utility would output to file. The results of one regex_grep can + even be chained into another regex_grep to create recursive parsers.
+The algorithm may throw std::runtime_error
if the complexity
+ of matching the expression against an N character string begins to exceed O(N^2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
Example: convert + the example from regex_search to use regex_grep instead:
++#include <string> +#include <map> +#include <boost/regex.hpp> + +// IndexClasses: +// takes the contents of a file in the form of a string +// and searches for all the C++ class definitions, storing +// their locations in a map of strings/int's +typedef std::map<std::string, int, std::less<std::string> > map_type; + +const char* re = + // possibly leading whitespace: + "^[[:space:]]*" + // possible template declaration: + "(template[[:space:]]*<[^;:{]+>[[:space:]]*)?" + // class or struct: + "(class|struct)[[:space:]]*" + // leading declspec macros etc: + "(" + "\\<\\w+\\>" + "(" + "[[:blank:]]*\\([^)]*\\)" + ")?" + "[[:space:]]*" + ")*" + // the class name + "(\\<\\w*\\>)[[:space:]]*" + // template specialisation parameters + "(<[^;:{]+>)?[[:space:]]*" + // terminate in { or : + "(\\{|:[^;\\{()]*\\{)"; + +boost::regex expression(re); +class IndexClassesPred +{ + map_type& m; + std::string::const_iterator base; +public: + IndexClassesPred(map_type& a, std::string::const_iterator b) : m(a), base(b) {} + bool operator()(const smatch& what) + { + // what[0] contains the whole string + // what[5] contains the class name. + // what[6] contains the template specialisation if any. + // add class name and position to map: + m[std::string(what[5].first, what[5].second) + std::string(what[6].first, what[6].second)] = + what[5].first - base; + return true; + } +}; +void IndexClasses(map_type& m, const std::string& file) +{ + std::string::const_iterator start, end; + start = file.begin(); + end = file.end(); + regex_grep(IndexClassesPred(m, start), start, end, expression); +} ++
Example: Use + regex_grep to call a global callback function:
++#include <string> +#include <map> +#include <boost/regex.hpp> + +// purpose: +// takes the contents of a file in the form of a string +// and searches for all the C++ class definitions, storing +// their locations in a map of strings/int's +typedef std::map<std::string, int, std::less<std::string> > map_type; + +const char* re = + // possibly leading whitespace: + "^[[:space:]]*" + // possible template declaration: + "(template[[:space:]]*<[^;:{]+>[[:space:]]*)?" + // class or struct: + "(class|struct)[[:space:]]*" + // leading declspec macros etc: + "(" + "\\<\\w+\\>" + "(" + "[[:blank:]]*\\([^)]*\\)" + ")?" + "[[:space:]]*" + ")*" + // the class name + "(\\<\\w*\\>)[[:space:]]*" + // template specialisation parameters + "(<[^;:{]+>)?[[:space:]]*" + // terminate in { or : + "(\\{|:[^;\\{()]*\\{)"; + +boost::regex expression(re); +map_type class_index; +std::string::const_iterator base; + +bool grep_callback(const boost::smatch& what) +{ + // what[0] contains the whole string + // what[5] contains the class name. + // what[6] contains the template specialisation if any. + // add class name and position to map: + class_index[std::string(what[5].first, what[5].second) + std::string(what[6].first, what[6].second)] = + what[5].first - base; + return true; +} +void IndexClasses(const std::string& file) +{ + std::string::const_iterator start, end; + start = file.begin(); + end = file.end(); + base = start; + regex_grep(grep_callback, start, end, expression, match_default); +} + ++
Example: use + regex_grep to call a class member function, use the standard library adapters std::mem_fun + and std::bind1st to convert the member function into a predicate:
++#include <string> +#include <map> +#include <boost/regex.hpp> +#include <functional> +// purpose: +// takes the contents of a file in the form of a string +// and searches for all the C++ class definitions, storing +// their locations in a map of strings/int's + +typedef std::map<std::string, int, std::less<std::string> > map_type; +class class_index +{ + boost::regex expression; + map_type index; + std::string::const_iterator base; + bool grep_callback(boost::smatch what); +public: + void IndexClasses(const std::string& file); + class_index() + : index(), + expression("^(template[[:space:]]*<[^;:{]+>[[:space:]]*)?" + "(class|struct)[[:space:]]*(\\<\\w+\\>([[:blank:]]*\\([^)]*\\))?" + "[[:space:]]*)*(\\<\\w*\\>)[[:space:]]*(<[^;:{]+>[[:space:]]*)?" + "(\\{|:[^;\\{()]*\\{)" + ){} +}; +bool class_index::grep_callback(boost::smatch what) +{ + // what[0] contains the whole string + // what[5] contains the class name. + // what[6] contains the template specialisation if any. + // add class name and position to map: + index[std::string(what[5].first, what[5].second) + std::string(what[6].first, what[6].second)] = + what[5].first - base; + return true; +} + +void class_index::IndexClasses(const std::string& file) +{ + std::string::const_iterator start, end; + start = file.begin(); + end = file.end(); + base = start; + regex_grep(std::bind1st(std::mem_fun(&class_index::grep_callback), this), + start, + end, + expression); +} + ++
Finally, C++ + Builder users can use C++ Builder's closure type as a callback argument:
++#include <string> +#include <map> +#include <boost/regex.hpp> +#include <functional> +// purpose: +// takes the contents of a file in the form of a string +// and searches for all the C++ class definitions, storing +// their locations in a map of strings/int's + +typedef std::map<std::string, int, std::less<std::string> > map_type; +class class_index +{ + boost::regex expression; + map_type index; + std::string::const_iterator base; + typedef boost::smatch arg_type; + bool grep_callback(const arg_type& what); +public: + typedef bool (__closure* grep_callback_type)(const arg_type&); + void IndexClasses(const std::string& file); + class_index() + : index(), + expression("^(template[[:space:]]*<[^;:{]+>[[:space:]]*)?" + "(class|struct)[[:space:]]*(\\<\\w+\\>([[:blank:]]*\\([^)]*\\))?" + "[[:space:]]*)*(\\<\\w*\\>)[[:space:]]*(<[^;:{]+>[[:space:]]*)?" + "(\\{|:[^;\\{()]*\\{)" + ){} +}; + +bool class_index::grep_callback(const arg_type& what) +{ + // what[0] contains the whole string +// what[5] contains the class name. +// what[6] contains the template specialisation if any. +// add class name and position to map: +index[std::string(what[5].first, what[5].second) + std::string(what[6].first, what[6].second)] = + what[5].first - base; + return true; +} + +void class_index::IndexClasses(const std::string& file) +{ + std::string::const_iterator start, end; + start = file.begin(); + end = file.end(); + base = start; + class_index::grep_callback_type cl = &(this->grep_callback); + regex_grep(cl, + start, + end, + expression); +} ++ +
Revised + + 26 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/regex_iterator.html b/doc/regex_iterator.html new file mode 100644 index 00000000..f2e647f5 --- /dev/null +++ b/doc/regex_iterator.html @@ -0,0 +1,456 @@ + + + +
+ |
+
+ Boost.Regex+regex_iterator+ |
+
+ |
+
The iterator type regex_iterator will enumerate all of the regular expression + matches found in some sequence: dereferencing a regex_iterator yields a + reference to a match_results object.
++template <class BidirectionalIterator, + class charT = iterator_traits<BidirectionalIterator>::value_type, + class traits = regex_traits<charT> > +class regex_iterator +{ +public: + typedef basic_regex<charT, traits> regex_type; + typedef match_results<BidirectionalIterator> value_type; + typedef typename iterator_traits<BidirectionalIterator>::difference_type difference_type; + typedef const value_type* pointer; + typedef const value_type& reference; + typedef std::forward_iterator_tag iterator_category; + + regex_iterator(); + regex_iterator(BidirectionalIterator a, BidirectionalIterator b, + const regex_type& re, + match_flag_type m = match_default); + regex_iterator(const regex_iterator&); + regex_iterator& operator=(const regex_iterator&); + bool operator==(const regex_iterator&)const; + bool operator!=(const regex_iterator&)const; + const value_type& operator*()const; + const value_type* operator->()const; + regex_iterator& operator++(); + regex_iterator operator++(int); +}; + +typedef +regex_iterator<const + +char*> cregex_iterator; typedef regex_iterator<std::string::const_iterator> +sregex_iterator; #ifndef BOOST_NO_WREGEX +typedef regex_iterator<const +wchar_t*> wcregex_iterator; typedef regex_iterator<std::wstring::const_iterator> +wsregex_iterator; #endif template +<class + +charT, class traits> regex_iterator<const charT*, +charT, traits> + make_regex_iterator(const charT* p, const basic_regex<charT, traits>& e, regex_constants::match_flag_type m = regex_constants::match_default); template <class + +charT, class traits, class ST, class SA> regex_iterator<typename std::basic_string<charT, +ST, SA>::const_iterator, charT, traits> + make_regex_iterator(const std::basic_string<charT, ST, SA>& p, const basic_regex<charT, traits>& e, regex_constants::match_flag_type m = regex_constants::match_default); + ++
A regex_iterator is constructed from a pair of iterators, and enumerates all + occurrences of a regular expression within that iterator range.
++regex_iterator(); ++ +
Effects: constructs an end of sequence regex_iterator.
+regex_iterator(BidirectionalIterator a, BidirectionalIterator b, + const regex_type& re, + match_flag_type m = match_default); ++ +
Effects: constructs a regex_iterator that will enumerate all occurrences + of the expression re, within the sequence [a,b), and found + using match flags m. The object re must exist for the + lifetime of the regex_iterator.
+Throws: std::runtime_error
if the complexity of
+ matching the expression against an N character string begins to exceed O(N^2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
+regex_iterator(const regex_iterator& that); ++ +
Effects: constructs a copy of that
.
Postconditions: *this == that
.
+regex_iterator& operator=(const regex_iterator&); ++ +
Effects: sets *this
equal to that
.
Postconditions: *this == that
.
+bool operator==(const regex_iterator& that)const; ++ +
Effects: returns true if *this is equal to that.
++bool operator!=(const regex_iterator&)const; ++ +
Effects: returns !(*this == that)
.
+const value_type& operator*()const; ++
Effects: dereferencing a regex_iterator object it yields a + const reference to a match_results object, + whose members are set as follows:
+ +
+ Element + |
+
+ Value + |
+
+ (*it).size() + |
+
+ re.mark_count() + |
+
+ (*it).empty() + |
+
+ false + |
+
+ (*it).prefix().first + |
+
+ The end of the last match found, or the start of the underlying sequence if + this is the first match enumerated + |
+
+ (*it).prefix().last + |
+
+ The same as the start of the match found: |
+
+ (*it).prefix().matched + |
+
+ True if the prefix did not match an empty string: |
+
+ (*it).suffix().first + |
+
+ The same as the end of the match found: |
+
+ (*it).suffix().last + |
+
+ The end of the underlying sequence. + |
+
+ (*it).suffix().matched + |
+
+ True if the suffix did not match an empty string: |
+
+ (*it)[0].first + |
+
+ The start of the sequence of characters that matched the regular expression + |
+
+ (*it)[0].second + |
+
+ The end of the sequence of characters that matched the regular expression + |
+
+ (*it)[0].matched + |
+
+
|
+
+ (*it)[n].first + |
+
+ For all integers n < (*it).size(), the start of the sequence that matched + sub-expression n. Alternatively, if sub-expression n did not participate + in the match, then last. + |
+
+ (*it)[n].second + |
+
+ For all integers n < (*it).size(), the end of the sequence that matched + sub-expression n. Alternatively, if sub-expression n did not participate + in the match, then last. + |
+
+ (*it)[n].matched + |
+
+ For all integers n < (*it).size(), true if sub-expression n participated + in the match, false otherwise. + |
+
(*it).position(n) | +For all integers n < (*it).size(), then the + distance from the start of the underlying sequence to the start of + sub-expression match n. | +
+const value_type* operator->()const; ++ +
Effects: returns &(*this)
.
+regex_iterator& operator++(); ++
Effects: moves the iterator to the next match in the + underlying sequence, or the end of sequence iterator if none is found. + When the last match found matched a zero length string, then the + regex_iterator will find the next match as follows: if there exists a non-zero + length match that starts at the same location as the last one, then returns it, + otherwise starts looking for the next (possibly zero length) match from one + position to the right of the last match.
+Throws: std::runtime_error
if the complexity of
+ matching the expression against an N character string begins to exceed O(N^2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
Returns: *this
.
+regex_iterator operator++(int); ++ +
Effects: constructs a copy result
of *this
,
+ then calls ++(*this)
.
Returns: result
.
template <class charT, class traits> regex_iterator<const charT*, charT, traits> +make_regex_iterator(const charT* + p, const basic_regex<charT, + traits>& e, regex_constants::match_flag_type m + = regex_constants::match_default); template <class + +charT, class traits, class ST, class SA> regex_iterator<typename std::basic_string<charT, +ST, SA>::const_iterator, charT, traits> + make_regex_iterator(const std::basic_string<charT, ST, SA>& p, + const basic_regex<charT, traits>& e, + regex_constants::match_flag_type m = regex_constants::match_default); ++
Effects: returns an iterator that enumerates all occurrences of + expression e in text p using match_flags m.
+The following example + takes a C++ source file and builds up an index of class names, and the location + of that class in the file.
++#include <string> +#include <map> +#include <fstream> +#include <iostream> +#include <boost/regex.hpp> + +using namespace std; + +// purpose: +// takes the contents of a file in the form of a string +// and searches for all the C++ class definitions, storing +// their locations in a map of strings/int's + +typedef std::map<std::string, std::string::difference_type, std::less<std::string> > map_type; + +const char* re = + // possibly leading whitespace: + "^[[:space:]]*" + // possible template declaration: + "(template[[:space:]]*<[^;:{]+>[[:space:]]*)?" + // class or struct: + "(class|struct)[[:space:]]*" + // leading declspec macros etc: + "(" + "\\<\\w+\\>" + "(" + "[[:blank:]]*\\([^)]*\\)" + ")?" + "[[:space:]]*" + ")*" + // the class name + "(\\<\\w*\\>)[[:space:]]*" + // template specialisation parameters + "(<[^;:{]+>)?[[:space:]]*" + // terminate in { or : + "(\\{|:[^;\\{()]*\\{)"; + + +boost::regex expression(re); +map_type class_index; + +bool regex_callback(const boost::match_results<std::string::const_iterator>& what) +{ + // what[0] contains the whole string + // what[5] contains the class name. + // what[6] contains the template specialisation if any. 
+ // add class name and position to map: + class_index[what[5].str() + what[6].str()] = what.position(5); + return true; +} + +void load_file(std::string& s, std::istream& is) +{ + s.erase(); + s.reserve(is.rdbuf()->in_avail()); + char c; + while(is.get(c)) + { + if(s.capacity() == s.size()) + s.reserve(s.capacity() * 3); + s.append(1, c); + } +} + +int main(int argc, const char** argv) +{ + std::string text; + for(int i = 1; i < argc; ++i) + { + cout << "Processing file " << argv[i] << endl; + std::ifstream fs(argv[i]); + load_file(text, fs); + // construct our iterators: + boost::sregex_iterator m1(text.begin(), text.end(), expression); + boost::sregex_iterator m2; + std::for_each(m1, m2, &regex_callback); + // copy results: + cout << class_index.size() << " matches found" << endl; + map_type::iterator c, d; + c = class_index.begin(); + d = class_index.end(); + while(c != d) + { + cout << "class \"" << (*c).first << "\" found at index: " << (*c).second << endl; + ++c; + } + class_index.erase(class_index.begin(), class_index.end()); + } + return 0; +} ++
Revised + + 06 Jan 05 +
+© Copyright John Maddock 1998- + 2005
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/regex_match.html b/doc/regex_match.html new file mode 100644 index 00000000..31c5eba6 --- /dev/null +++ b/doc/regex_match.html @@ -0,0 +1,318 @@ + + + ++
+ |
+
+ Boost.Regex+Algorithm regex_match+ |
+
+ |
+
#include <boost/regex.hpp>+
+ The algorithm regex_match determines whether a given regular expression + matches all of a given character sequence denoted by a pair of + bidirectional-iterators. The algorithm is defined as follows; the main use of + this function is data input validation. +
Note that the result is true only if the expression matches the whole of + the input sequence. If you want to search for an expression + somewhere within the sequence then use regex_search. + If you want to match a prefix of the character string then use + regex_search with the flag match_continuous + set. +
+template <class BidirectionalIterator, class Allocator, class charT, class traits> +bool regex_match(BidirectionalIterator first, BidirectionalIterator last, + match_results<BidirectionalIterator, Allocator>& m, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default); + +template <class BidirectionalIterator, class charT, class traits> +bool regex_match(BidirectionalIterator first, BidirectionalIterator last, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default); + +template <class charT, class Allocator, class traits> +bool regex_match(const charT* str, match_results<const charT*, Allocator>& m, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default); + +template <class ST, class SA, class Allocator, class charT, class traits> +bool regex_match(const basic_string<charT, ST, SA>& s, + match_results<typename basic_string<charT, ST, SA>::const_iterator, Allocator>& m, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default); + +template <class charT, class traits> +bool regex_match(const charT* str, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default); + +template <class ST, class SA, class charT, class traits> +bool regex_match(const basic_string<charT, ST, SA>& s, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default); ++
template <class BidirectionalIterator, class Allocator, class charT, class traits> +bool regex_match(BidirectionalIterator first, BidirectionalIterator last, + match_results<BidirectionalIterator, Allocator>& m, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default);+
Requires: Type BidirectionalIterator meets the requirements of a + Bidirectional Iterator (24.1.4).
+Effects: Determines whether there is an exact match between the regular + expression e, and all of the character sequence [first, last), parameter + flags is used to control how the expression + is matched against the character sequence. Returns true if such a match + exists, false otherwise.
+Throws: std::runtime_error
if the complexity of
+ matching the expression against an N character string begins to exceed O(N^2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
Postconditions: If the function returns false, then the effect on + parameter m is undefined, otherwise the effects on parameter m are + given in the table:
++
+ Element + + |
+
+ Value + + |
+
+ m.size() + |
+
+ e.mark_count() + |
+
+ m.empty() + |
+
+ false + |
+
+ m.prefix().first + |
+
+ first + |
+
+ m.prefix().last + |
+
+ first + |
+
+ m.prefix().matched + |
+
+ false + |
+
+ m.suffix().first + |
+
+ last + |
+
+ m.suffix().last + |
+
+ last + |
+
+ m.suffix().matched + |
+
+ false + |
+
+ m[0].first + |
+
+ first + |
+
+ m[0].second + |
+
+ last + |
+
+ m[0].matched + |
+
+
|
+
+ m[n].first + |
+
+ For all integers n < m.size(), the start of the sequence that matched + sub-expression n. Alternatively, if sub-expression n did not participate + in the match, then last. + |
+
+ m[n].second + |
+
+ For all integers n < m.size(), the end of the sequence that matched + sub-expression n. Alternatively, if sub-expression n did not participate + in the match, then last. + |
+
+ m[n].matched + |
+
+ For all integers n < m.size(), true if sub-expression n participated + in the match, false otherwise. + |
+
+
template <class BidirectionalIterator, class charT, class traits> +bool regex_match(BidirectionalIterator first, BidirectionalIterator last, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default);+
Effects: Behaves "as if" by constructing an instance of
+ match_results<
BidirectionalIterator> what
,
+ and then returning the result of regex_match(first, last, what, e, flags)
.
template <class charT, class Allocator, class traits> +bool regex_match(const charT* str, match_results<const charT*, Allocator>& m, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default);+
Effects: Returns the result of regex_match(str, str +
+ char_traits<charT>::length(str), m, e, flags)
.
template <class ST, class SA, class Allocator, + class charT, class traits> +bool regex_match(const basic_string<charT, ST, SA>& s, + match_results<typename basic_string<charT, ST, SA>::const_iterator, Allocator>& m, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default);+
Effects: Returns the result of regex_match(s.begin(), s.end(), m, e,
+ flags)
.
template <class charT, class traits> +bool regex_match(const charT* str, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default);+
Effects: Returns the result of regex_match(str, str +
+ char_traits<charT>::length(str), e, flags)
.
template <class ST, class SA, class charT, class traits> +bool regex_match(const basic_string<charT, ST, SA>& s, + const basic_regex <charT, traits>& e, + match_flag_type flags = match_default);+
Effects: Returns the result of regex_match(s.begin(), s.end(), e,
+ flags)
.
+
The following example + processes an ftp response: +
+#include <stdlib.h> +#include <boost/regex.hpp> +#include <string> +#include <iostream> + +using namespace boost; + +regex expression("([0-9]+)(\\-| |$)(.*)"); + +// process_ftp: +// on success returns the ftp response code, and fills +// msg with the ftp response message. +int process_ftp(const char* response, std::string* msg) +{ + cmatch what; + if(regex_match(response, what, expression)) + { + // what[0] contains the whole string + // what[1] contains the response code + // what[2] contains the separator character + // what[3] contains the text message. + if(msg) + msg->assign(what[3].first, what[3].second); + return std::atoi(what[1].first); + } + // failure did not match + if(msg) + msg->erase(); + return -1; +} ++ ++
Revised + + 26 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/regex_merge.html b/doc/regex_merge.html new file mode 100644 index 00000000..bbfcc23b --- /dev/null +++ b/doc/regex_merge.html @@ -0,0 +1,45 @@ + + + ++
+ |
+
+ Boost.Regex+Algorithm regex_merge (deprecated)+ |
+
+ |
+
Algorithm regex_merge has been renamed regex_replace; + existing code will continue to compile, but new code should use + regex_replace instead.
++
Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + diff --git a/doc/regex_replace.html b/doc/regex_replace.html new file mode 100644 index 00000000..2005ccb0 --- /dev/null +++ b/doc/regex_replace.html @@ -0,0 +1,256 @@ + + + ++
+ |
+
+ Boost.Regex+Algorithm regex_replace+ |
+
+ |
+
#include <boost/regex.hpp>+
The algorithm regex_replace searches through a string finding + all the matches to the regular expression: for each match it then calls + match_results::format to format the string and sends the result to the + output iterator. Sections of text that do not match are copied to the output + unchanged only if the flags parameter does not have the flag + format_no_copy set. If the flag format_first_only + is set then only the first occurrence is replaced rather than all + occurrences.
template <class OutputIterator, class BidirectionalIterator, class traits, class charT> +OutputIterator regex_replace(OutputIterator out, + BidirectionalIterator first, + BidirectionalIterator last, + const basic_regex<charT, traits>& e, + const basic_string<charT>& fmt, + match_flag_type flags = match_default); + +template <class traits, class charT> +basic_string<charT> regex_replace(const basic_string<charT>& s, + const basic_regex<charT, traits>& e, + const basic_string<charT>& fmt, + match_flag_type flags = match_default); + ++
template <class OutputIterator, class BidirectionalIterator, class traits, class charT> +OutputIterator regex_replace(OutputIterator out, + BidirectionalIterator first, + BidirectionalIterator last, + const basic_regex<charT, traits>& e, + const basic_string<charT>& fmt, + match_flag_type flags = match_default);+
Enumerates all the occurrences of expression e in the sequence [first, + last), replacing each occurrence with the string that results by merging the + match found with the format string fmt, and copies the resulting + string to out.
+If the flag format_no_copy is set in flags then unmatched sections of + text are not copied to output. +
+If the flag format_first_only is set in flags then only the first + occurrence of e is replaced. +
+The manner in which the format string fmt is interpreted, along with + the rules used for finding matches, is determined by the + flags set in flags.
+Effects: Constructs a + regex_iterator + object: +
+regex_iterator<BidirectionalIterator, charT, traits, Allocator>+
i(first, last, e, flags),
and uses + + i + to enumerate through all of the matches m of type + + match_results + <BidirectionalIterator> that + occur within the sequence [first, last). +
+If no such matches are found + and
+!(flags & format_no_copy)+
then calls +
+std::copy(first, last, out).+
Otherwise, for each match found, + if
+!(flags & format_no_copy)+
calls +
+std::copy(m.prefix().first, m.prefix().last, out),+
and then calls +
+m.format(out, fmt, flags).+
Finally + if
+!(flags & format_no_copy)+
calls +
+std::copy(last_m.suffix().first, last_m.suffix().last, out)+
where + + last_m + + is a copy of the last match found. +
+If + flags & +format_first_only + is non-zero then only the first match found is replaced.
+Throws: std::runtime_error
if the complexity of
+ matching the expression against an N character string begins to exceed O(N^2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
Returns: out
.
+
template <class traits, class charT> +basic_string<charT> regex_replace(const basic_string<charT>& s, + const basic_regex<charT, traits>& e, + const basic_string<charT>& fmt, + match_flag_type flags = match_default);+
Effects: Constructs an object basic_string<charT> result
,
+ calls regex_replace(back_inserter(result), s.begin(), s.end(), e, fmt,
+ flags)
, and then returns result
.
+
The following example + takes C/C++ source code as input, and outputs syntax highlighted HTML code.
+ +#include <fstream> +#include <sstream> +#include <string> +#include <iterator> +#include <boost/regex.hpp> +#include <fstream> +#include <iostream> + +// purpose: +// takes the contents of a file and transform to +// syntax highlighted code in html format + +boost::regex e1, e2; +extern const char* expression_text; +extern const char* format_string; +extern const char* pre_expression; +extern const char* pre_format; +extern const char* header_text; +extern const char* footer_text; + +void load_file(std::string& s, std::istream& is) +{ + s.erase(); + s.reserve(is.rdbuf()->in_avail()); + char c; + while(is.get(c)) + { + if(s.capacity() == s.size()) + s.reserve(s.capacity() * 3); + s.append(1, c); + } +} + +int main(int argc, const char** argv) +{ + try{ + e1.assign(expression_text); + e2.assign(pre_expression); + for(int i = 1; i < argc; ++i) + { + std::cout << "Processing file " << argv[i] << std::endl; + std::ifstream fs(argv[i]); + std::string in; + load_file(in, fs); + std::string out_name(std::string(argv[i]) + std::string(".htm")); + std::ofstream os(out_name.c_str()); + os << header_text; + // strip '<' and '>' first by outputting to a + // temporary string stream + std::ostringstream t(std::ios::out | std::ios::binary); + std::ostream_iterator<char, char> oi(t); + boost::regex_replace(oi, in.begin(), in.end(), + e2, pre_format, boost::match_default | boost::format_all); + // then output to final output stream + // adding syntax highlighting: + std::string s(t.str()); + std::ostream_iterator<char, char> out(os); + boost::regex_replace(out, s.begin(), s.end(), + e1, format_string, boost::match_default | boost::format_all); + os << footer_text; + } + } + catch(...) 
+ { return -1; } + return 0; +} + +extern const char* pre_expression = "(<)|(>)|(&)|\\r"; +extern const char* pre_format = "(?1<)(?2>)(?3&)"; + + +const char* expression_text = // preprocessor directives: index 1 + "(^[[:blank:]]*#(?:[^\\\\\\n]|\\\\[^\\n[:punct:][:word:]]*[\\n[:punct:][:word:]])*)|" + // comment: index 2 + "(//[^\\n]*|/\\*.*?\\*/)|" + // literals: index 3 + "\\<([+-]?(?:(?:0x[[:xdigit:]]+)|(?:(?:[[:digit:]]*\\.)?[[:digit:]]+(?:[eE][+-]?[[:digit:]]+)?))u?(?:(?:int(?:8|16|32|64))|L)?)\\>|" + // string literals: index 4 + "('(?:[^\\\\']|\\\\.)*'|\"(?:[^\\\\\"]|\\\\.)*\")|" + // keywords: index 5 + "\\<(__asm|__cdecl|__declspec|__export|__far16|__fastcall|__fortran|__import" + "|__pascal|__rtti|__stdcall|_asm|_cdecl|__except|_export|_far16|_fastcall" + "|__finally|_fortran|_import|_pascal|_stdcall|__thread|__try|asm|auto|bool" + "|break|case|catch|cdecl|char|class|const|const_cast|continue|default|delete" + "|do|double|dynamic_cast|else|enum|explicit|extern|false|float|for|friend|goto" + "|if|inline|int|long|mutable|namespace|new|operator|pascal|private|protected" + "|public|register|reinterpret_cast|return|short|signed|sizeof|static|static_cast" + "|struct|switch|template|this|throw|true|try|typedef|typeid|typename|union|unsigned" + "|using|virtual|void|volatile|wchar_t|while)\\>" + ; + +const char* format_string = "(?1<font color=\"#008040\">$&</font>)" + "(?2<I><font color=\"#000080\">$&</font></I>)" + "(?3<font color=\"#0000A0\">$&</font>)" + "(?4<font color=\"#0000FF\">$&</font>)" + "(?5<B>$&</B>)"; + +const char* header_text = "<HTML>\n<HEAD>\n" + "<TITLE>Auto-generated html formated source</TITLE>\n" + "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=windows-1252\">\n" + "</HEAD>\n" + "<BODY LINK=\"#0000ff\" VLINK=\"#800080\" BGCOLOR=\"#ffffff\">\n" + "<P> </P>\n<PRE>"; + +const char* footer_text = "</PRE>\n</BODY>\n\n"; ++
Revised + + 26 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/regex_search.html b/doc/regex_search.html new file mode 100644 index 00000000..22691687 --- /dev/null +++ b/doc/regex_search.html @@ -0,0 +1,315 @@ + + + ++
+ |
+
+ Boost.Regex+Algorithm regex_search+ |
+
+ |
+
#include <boost/regex.hpp>+ +
The algorithm regex_search will search a range denoted by a pair of + bidirectional-iterators for a given regular expression. The algorithm uses + various heuristics to reduce the search time by only checking for a match if a + match could conceivably start at that position. The algorithm is defined as + follows: +
template <class BidirectionalIterator, + class Allocator, class charT, class traits> +bool regex_search(BidirectionalIterator first, BidirectionalIterator last, + match_results<BidirectionalIterator, Allocator>& m, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default); + +template <class ST, class SA, + class Allocator, class charT, class traits> +bool regex_search(const basic_string<charT, ST, SA>& s, + match_results< + typename basic_string<charT, ST,SA>::const_iterator, + Allocator>& m, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default); + +template<class charT, class Allocator, class traits> +bool regex_search(const charT* str, + match_results<const charT*, Allocator>& m, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default); + +template <class BidirectionalIterator, class charT, class traits> +bool regex_search(BidirectionalIterator first, BidirectionalIterator last, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default); + +template <class charT, class traits> +bool regex_search(const charT* str, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default); + +template<class ST, class SA, class charT, class traits> +bool regex_search(const basic_string<charT, ST, SA>& s, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default); ++
template <class BidirectionalIterator, class Allocator, class charT, class traits> +bool regex_search(BidirectionalIterator first, BidirectionalIterator last, + match_results<BidirectionalIterator, Allocator>& m, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default);+
Requires: Type BidirectionalIterator meets the requirements of a + Bidirectional Iterator (24.1.4).
+Effects: Determines whether there is some sub-sequence within + [first,last) that matches the regular expression e; parameter flags + is used to control how the expression is matched against the character + sequence. Returns true if such a sequence exists, false otherwise.
+Throws: std::runtime_error
if the complexity of
+ matching the expression against an N character string begins to exceed O(N2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
Postconditions: If the function returns false, then the effect on + parameter m is undefined, otherwise the effects on parameter m are + given in the table:
+
+ Element + |
+
+ Value + + |
+
+ m.size() + |
+
+ e.mark_count() + |
+
+ m.empty() + |
+
+ false + |
+
+ m.prefix().first + |
+
+ first + |
+
+ m.prefix().last + |
+
+ m[0].first + |
+
+ m.prefix().matched + |
+
+ m.prefix().first != m.prefix().second + |
+
+ m.suffix().first + |
+
+ m[0].second + |
+
+ m.suffix().last + |
+
+ last + |
+
+ m.suffix().matched + |
+
+ m.suffix().first != m.suffix().second + |
+
+ m[0].first + |
+
+ The start of the sequence of characters that matched the regular expression + |
+
+ m[0].second + |
+
+ The end of the sequence of characters that matched the regular expression + |
+
+ m[0].matched + |
+
+
|
+
+ m[n].first + |
+
+ For all integers n < m.size(), the start of the sequence that matched + sub-expression n. Alternatively, if sub-expression n did not participate + in the match, then last. + |
+
+ m[n].second + |
+
+ For all integers n < m.size(), the end of the sequence that matched + sub-expression n. Alternatively, if sub-expression n did not participate + in the match, then last. + |
+
+ m[n].matched + |
+
+ For all integers n < m.size(), true if sub-expression n participated + in the match, false otherwise. + |
+
template <class charT, class Allocator, class traits> +bool regex_search(const charT* str, match_results<const charT*, Allocator>& m, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default);+
Effects: Returns the result of regex_search(str, str +
+ char_traits<charT>::length(str), m, e, flags)
.
template <class ST, class SA, class Allocator, class charT, + class traits> +bool regex_search(const basic_string<charT, ST, SA>& s, + match_results<typename basic_string<charT, ST, SA>::const_iterator, Allocator>& m, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default);+
Effects: Returns the result of regex_search(s.begin(), s.end(), m,
+ e, flags)
.
template <class BidirectionalIterator, class charT, class traits> +bool regex_search(BidirectionalIterator first, BidirectionalIterator last, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default);
Effects: Behaves "as if" by constructing an instance of
+ match_results<
BidirectionalIterator> what
,
+ and then returning the result of regex_search(first, last, what, e, flags)
.
template <class charT, class traits> +bool regex_search(const charT* str, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default);
Effects: Returns the result of regex_search(str, str +
+ char_traits<charT>::length(str), e, flags)
.
template <class ST, class SA, class charT, class traits> +bool regex_search(const basic_string<charT, ST, SA>& s, + const basic_regex<charT, traits>& e, + match_flag_type flags = match_default);+
Effects: Returns the result of regex_search(s.begin(), s.end(), e,
+ flags)
.
+
The following example + takes the contents of a file in the form of a string, and searches for all the + C++ class declarations in the file. The code will work regardless of the way + that std::string is implemented; for example, it could easily be modified to + work with the SGI rope class, which uses a non-contiguous storage strategy.
+ +#include <string> +#include <map> +#include <boost/regex.hpp> + +// purpose: +// takes the contents of a file in the form of a string +// and searches for all the C++ class definitions, storing +// their locations in a map of strings/int's +typedef std::map<std::string, int, std::less<std::string> > map_type; + +boost::regex expression("^(template[[:space:]]*<[^;:{]+>[[:space:]]*)?(class|struct)[[:space:]]*(\\<\\w+\\>([[:blank:]]*\\([^)]*\\))?[[:space:]]*)*(\\<\\w*\\>)[[:space:]]*(<[^;:{]+>[[:space:]]*)?(\\{|:[^;\\{()]*\\{)"); + +void IndexClasses(map_type& m, const std::string& file) +{ + std::string::const_iterator start, end; + start = file.begin(); + end = file.end(); + boost::match_results<std::string::const_iterator> what; + boost::match_flag_type flags = boost::match_default; + while(regex_search(start, end, what, expression, flags)) + { + // what[0] contains the whole string + // what[5] contains the class name. + // what[6] contains the template specialisation if any. + // add class name and position to map: + m[std::string(what[5].first, what[5].second) + std::string(what[6].first, what[6].second)] = + what[5].first - file.begin(); + // update search position: + start = what[0].second; + // update flags: + flags |= boost::match_prev_avail; + flags |= boost::match_not_bob; + } +} ++
Revised + + 23 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/regex_split.html b/doc/regex_split.html new file mode 100644 index 00000000..a3b7b293 --- /dev/null +++ b/doc/regex_split.html @@ -0,0 +1,145 @@ + + + ++
+ |
+
+ Boost.Regex+Algorithm regex_split (deprecated)+ |
+
+ |
+
The algorithm regex_split has been deprecated in favor of the iterator + regex_token_iterator which has a more flexible and powerful interface, + as well as following the more usual standard library "pull" rather than "push" + semantics.
+Code which uses regex_split will continue to compile; the following + documentation is taken from the previous boost.regex version:
+#include <boost/regex.hpp>+
Algorithm regex_split performs a similar operation to the perl split operation, + and comes in three overloaded forms: +
+template <class OutputIterator, class charT, class Traits1, class Alloc1, class Traits2> +std::size_t regex_split(OutputIterator out, + std::basic_string<charT, Traits1, Alloc1>& s, + const basic_regex<charT, Traits2>& e, + boost::match_flag_type flags, + std::size_t max_split); + +template <class OutputIterator, class charT, class Traits1, class Alloc1, class Traits2> +std::size_t regex_split(OutputIterator out, + std::basic_string<charT, Traits1, Alloc1>& s, + const basic_regex<charT, Traits2>& e, + boost::match_flag_type flags = match_default); + +template <class OutputIterator, class charT, class Traits1, class Alloc1> +std::size_t regex_split(OutputIterator out, + std::basic_string<charT, Traits1, Alloc1>& s);+
Effects: Each version of the algorithm takes an + output-iterator for output, and a string for input. If the expression contains + no marked sub-expressions, then the algorithm writes one string onto the + output-iterator for each section of input that does not match the expression. + If the expression does contain marked sub-expressions, then each time a match + is found, one string for each marked sub-expression will be written to the + output-iterator. No more than max_split strings will be written to the + output-iterator. Before returning, all the input processed will be deleted from + the string s (if max_split is not reached then all of s will + be deleted). Returns the number of strings written to the output-iterator. If + the parameter max_split is not specified then it defaults to UINT_MAX. + If no expression is specified, then it defaults to "\s+", and splitting occurs + on whitespace. +
+Throws: std::runtime_error
if the complexity of
+ matching the expression against an N character string begins to exceed O(N2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
Example: the + following function will split the input string into a series of tokens, and + remove each token from the string s: +
+unsigned tokenise(std::list<std::string>& l, std::string& s) +{ + return boost::regex_split(std::back_inserter(l), s); +}+
Example: the + following short program will extract all of the URLs from an HTML file, and + print them out to cout: +
+#include <list> +#include <fstream> +#include <iostream> +#include <boost/regex.hpp> + +boost::regex e("<\\s*A\\s+[^>]*href\\s*=\\s*\"([^\"]*)\"", + boost::regbase::normal | boost::regbase::icase); + +void load_file(std::string& s, std::istream& is) +{ + s.erase(); + // + // attempt to grow string buffer to match file size, + // this doesn't always work... + s.reserve(is.rdbuf()->in_avail()); + char c; + while(is.get(c)) + { + // use logarithmic growth stategy, in case + // in_avail (above) returned zero: + if(s.capacity() == s.size()) + s.reserve(s.capacity() * 3); + s.append(1, c); + } +} + + +int main(int argc, char** argv) +{ + std::string s; + std::list<std::string> l; + + for(int i = 1; i < argc; ++i) + { + std::cout << "Findings URL's in " << argv[i] << ":" << std::endl; + s.erase(); + std::ifstream is(argv[i]); + load_file(s, is); + boost::regex_split(std::back_inserter(l), s, e); + while(l.size()) + { + s = *(l.begin()); + l.pop_front(); + std::cout << s << std::endl; + } + } + return 0; +}+
Revised + + 26 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/regex_token_iterator.html b/doc/regex_token_iterator.html new file mode 100644 index 00000000..9bd59050 --- /dev/null +++ b/doc/regex_token_iterator.html @@ -0,0 +1,381 @@ + + + ++
+ |
+
+ Boost.Regex+regex_token_iterator+ |
+
+ |
+
The template class regex_token_iterator
is an iterator adapter;
+ that is to say it represents a new view of an existing iterator sequence, by
+ enumerating all the occurrences of a regular expression within that sequence,
+ and presenting one or more character sequences for each match found. Each
+ position enumerated by the iterator is a sub_match
+ object that represents what matched a particular sub-expression within the
+ regular expression. When class regex_token_iterator
is used to
+ enumerate a single sub-expression with index -1, then the iterator performs
+ field splitting: that is to say it enumerates one character sequence for each
+ section of the character container sequence that does not match the regular
+ expression specified.
+template <class BidirectionalIterator, + class charT = iterator_traits<BidirectionalIterator>::value_type, + class traits = regex_traits<charT> > +class regex_token_iterator +{ +public: + typedef basic_regex<charT, traits> regex_type; + typedef sub_match<BidirectionalIterator> value_type; + typedef typename iterator_traits<BidirectionalIterator>::difference_type difference_type; + typedef const value_type* pointer; + typedef const value_type& reference; + typedef std::forward_iterator_tag iterator_category; + + regex_token_iterator(); + regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, const regex_type& re, + int submatch = 0, match_flag_type m = match_default); + regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, const regex_type& re, + const std::vector<int>& submatches, match_flag_type m = match_default); + template <std::size_t N> + regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, const regex_type& re, + const int (&submatches)[N], match_flag_type m = match_default); + regex_token_iterator(const regex_token_iterator&); + regex_token_iterator& operator=(const regex_token_iterator&); + bool operator==(const regex_token_iterator&)const; + bool operator!=(const regex_token_iterator&)const; + const value_type& operator*()const; + const value_type* operator->()const; + regex_token_iterator& operator++(); + regex_token_iterator operator++(int); +}; + +typedef regex_token_iterator<const char*> cregex_token_iterator; +typedef regex_token_iterator<std::string::const_iterator> sregex_token_iterator; +#ifndef BOOST_NO_WREGEX +typedef regex_token_iterator<const wchar_t*> wcregex_token_iterator; +typedef regex_token_iterator<std::wstring::const_iterator> wsregex_token_iterator; +#endif + +template <class charT, class traits> +regex_token_iterator<const charT*, charT, traits> + make_regex_token_iterator(const charT* p, + const basic_regex<charT, traits>& e, + int submatch = 0, + regex_constants::match_flag_type 
m = regex_constants::match_default); + +template <class charT, class traits, class ST, class SA> +regex_token_iterator<typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits> + make_regex_token_iterator(const std::basic_string<charT, ST, SA>& p, + const basic_regex<charT, traits>& e, + int submatch = 0, + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class traits, std::size_t N> +regex_token_iterator<const charT*, charT, traits> + make_regex_token_iterator(const charT* p, + const basic_regex<charT, traits>& e, + const int (&submatch)[N], + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class traits, class ST, class SA, std::size_t N> +regex_token_iterator<typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits> + make_regex_token_iterator(const std::basic_string<charT, ST, SA>& p, + const basic_regex<charT, traits>& e, + const int (&submatch)[N], + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class traits> +regex_token_iterator<const charT*, charT, traits> + make_regex_token_iterator(const charT* p, + const basic_regex<charT, traits>& e, + const std::vector<int>& submatch, + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class traits, class ST, class SA> +regex_token_iterator<typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits> + make_regex_token_iterator(const std::basic_string<charT, ST, SA>& p, + const basic_regex<charT, traits>& e, + const std::vector<int>& submatch, + regex_constants::match_flag_type m = regex_constants::match_default); ++
regex_token_iterator();+
Effects: constructs an end of sequence iterator.
+regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, const regex_type& re, + int submatch = 0, match_flag_type m = match_default);+
Preconditions: !re.empty()
. Object re shall exist
+ for the lifetime of the iterator constructed from it.
Effects: constructs a regex_token_iterator that will enumerate one + string for each regular expression match of the expression re found + within the sequence [a,b), using match flags m. The + string enumerated is the sub-expression submatch for each match + found; if submatch is -1, then enumerates all the text sequences that + did not match the expression re (that is, it performs field splitting).
+Throws: std::runtime_error
if the complexity of
+ matching the expression against an N character string begins to exceed O(N2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, const regex_type& re, + const std::vector<int>& submatches, match_flag_type m = match_default);+
Preconditions: submatches.size() && !re.empty()
.
+ Object re shall exist for the lifetime of the iterator constructed from it.
Effects: constructs a regex_token_iterator that will enumerate submatches.size() + strings for each regular expression match of the expression re found + within the sequence [a,b), using match flags m. For + each match found one string will be enumerated for each sub-expression + index contained within submatches vector; if submatches[0] + is -1, then the first string enumerated for each match will be all of the text + from end of the last match to the start of the current match, in addition there + will be one extra string enumerated when no more matches can be found: from the + end of the last match found, to the end of the underlying sequence.
+Throws: std::runtime_error
if the complexity of
+ matching the expression against an N character string begins to exceed O(N2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
template <std::size_t N> +regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, const regex_type& re, + const int (&submatches)[N], match_flag_type m = match_default);
Preconditions: !re.empty()
. Object re shall exist
+ for the lifetime of the iterator constructed from it.
Effects: constructs a regex_token_iterator that will + enumerate N strings for each regular expression match of the + expression re found within the sequence [a,b), using match + flags m. For each match found one string will be + enumerated for each sub-expression index contained within the submatches + array; if submatches[0] is -1, then the first string enumerated + for each match will be all of the text from end of the last match to the start + of the current match, in addition there will be one extra string enumerated + when no more matches can be found: from the end of the last match found, to the + end of the underlying sequence.
+Throws: std::runtime_error
if the complexity of
+ matching the expression against an N character string begins to exceed O(N2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
regex_token_iterator(const regex_token_iterator& that);+
Effects: constructs a copy of that
.
Postconditions: *this == that
.
regex_token_iterator& operator=(const regex_token_iterator& that);+
Effects: sets *this
to be equal to that
.
Postconditions: *this == that
.
bool operator==(const regex_token_iterator&)const;+
+ Effects: returns true if *this is the same position as that.
+bool operator!=(const regex_token_iterator&)const;+
+ Effects: returns !(*this == that)
.
const value_type& operator*()const;+
+ Effects: returns the current character sequence being enumerated.
+const value_type* operator->()const;+
+ Effects: returns &(*this)
.
regex_token_iterator& operator++();+
+ Effects: Moves on to the next character sequence to be enumerated.
+Throws: std::runtime_error
if the complexity of
+ matching the expression against an N character string begins to exceed O(N2),
+ or if the program runs out of stack space while matching the expression (if
+ Boost.regex is configured in recursive mode),
+ or if the matcher exhausts its permitted memory allocation (if Boost.regex is
+ configured in non-recursive mode).
+ Returns: *this
.
regex_token_iterator operator++(int);
Effects: constructs a copy result
of *this
,
+ then calls ++(*this)
.
template <class charT, class traits> +regex_token_iterator<const charT*, charT, traits> + make_regex_token_iterator(const charT* p, + const basic_regex<charT, traits>& e, + int submatch = 0, + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class traits, class ST, class SA> +regex_token_iterator<typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits> + make_regex_token_iterator(const std::basic_string<charT, ST, SA>& p, + const basic_regex<charT, traits>& e, + int submatch = 0, + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class traits, std::size_t N> +regex_token_iterator<const charT*, charT, traits> + make_regex_token_iterator(const charT* p, + const basic_regex<charT, traits>& e, + const int (&submatch)[N], + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class traits, class ST, class SA, std::size_t N> +regex_token_iterator<typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits> + make_regex_token_iterator(const std::basic_string<charT, ST, SA>& p, + const basic_regex<charT, traits>& e, + const int (&submatch)[N], + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class traits> +regex_token_iterator<const charT*, charT, traits> + make_regex_token_iterator(const charT* p, + const basic_regex<charT, traits>& e, + const std::vector<int>& submatch, + regex_constants::match_flag_type m = regex_constants::match_default); + +template <class charT, class traits, class ST, class SA> +regex_token_iterator<typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits> + make_regex_token_iterator(const std::basic_string<charT, ST, SA>& p, + const basic_regex<charT, traits>& e, + const std::vector<int>& submatch, + regex_constants::match_flag_type m = regex_constants::match_default); ++
Effects: returns a regex_token_iterator that enumerates + one sub_match for each value in submatch for + each occurrence of regular expression e in string p, matched + using match_flags m.
+ +The following example + takes a string and splits it into a series of tokens:
++#include <iostream> +#include <boost/regex.hpp> + +using namespace std; + +int main(int argc) +{ + string s; + do{ + if(argc == 1) + { + cout << "Enter text to split (or \"quit\" to exit): "; + getline(cin, s); + if(s == "quit") break; + } + else + s = "This is a string of tokens"; + + boost::regex re("\\s+"); + boost::sregex_token_iterator i(s.begin(), s.end(), re, -1); + boost::sregex_token_iterator j; + + unsigned count = 0; + while(i != j) + { + cout << *i++ << endl; + count++; + } + cout << "There were " << count << " tokens found." << endl; + + }while(argc == 1); + return 0; +} + ++
The following example + takes an HTML file and outputs a list of all the linked files:
++#include <fstream> +#include <iostream> +#include <iterator> +#include <boost/regex.hpp> + +boost::regex e("<\\s*A\\s+[^>]*href\\s*=\\s*\"([^\"]*)\"", + boost::regex::normal | boost::regbase::icase); + +void load_file(std::string& s, std::istream& is) +{ + s.erase(); + // + // attempt to grow string buffer to match file size, + // this doesn't always work... + s.reserve(is.rdbuf()->in_avail()); + char c; + while(is.get(c)) + { + // use logarithmic growth stategy, in case + // in_avail (above) returned zero: + if(s.capacity() == s.size()) + s.reserve(s.capacity() * 3); + s.append(1, c); + } +} + +int main(int argc, char** argv) +{ + std::string s; + int i; + for(i = 1; i < argc; ++i) + { + std::cout << "Findings URL's in " << argv[i] << ":" << std::endl; + s.erase(); + std::ifstream is(argv[i]); + load_file(s, is); + boost::sregex_token_iterator i(s.begin(), s.end(), e, 1); + boost::sregex_token_iterator j; + while(i != j) + { + std::cout << *i++ << std::endl; + } + } + // + // alternative method: + // test the array-literal constructor, and split out the whole + // match as well as $1.... + // + for(i = 1; i < argc; ++i) + { + std::cout << "Findings URL's in " << argv[i] << ":" << std::endl; + s.erase(); + std::ifstream is(argv[i]); + load_file(s, is); + const int subs[] = {1, 0,}; + boost::sregex_token_iterator i(s.begin(), s.end(), e, subs); + boost::sregex_token_iterator j; + while(i != j) + { + std::cout << *i++ << std::endl; + } + } + + return 0; +} ++
Revised + + 26 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + diff --git a/doc/regex_traits.html b/doc/regex_traits.html new file mode 100644 index 00000000..dc1708f7 --- /dev/null +++ b/doc/regex_traits.html @@ -0,0 +1,87 @@ + + + ++
+ |
+
+ Boost.Regex+class regex_traits+ |
+
+ |
+
+namespace boost{ + +template <class charT, class implementationT = sensible_default_choice> +struct regex_traits : public implementationT +{ + regex_traits() : implementationT() {} +}; + +template <class charT> +struct c_regex_traits; + +template <class charT> +struct cpp_regex_traits; + +template <class charT> +struct w32_regex_traits; + +} // namespace boost ++
The class regex_traits is just a thin wrapper around an actual implementation + class, which may be one of:
+The default behavior can be altered by defining one of the following + configuration macros in boost/regex/user.hpp:
+All these traits classes fulfil the traits class + requirements.
+Revised + + 24 June 2004 +
+© Copyright John Maddock 1998- + + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/standards.html b/doc/standards.html new file mode 100644 index 00000000..d5bc8c46 --- /dev/null +++ b/doc/standards.html @@ -0,0 +1,237 @@ + + + ++
+ |
+
+ Boost.Regex+Standards Conformance+ |
+
+ |
+
Boost.regex is intended to conform to the + regular expression standardization proposal, which will appear in a + future C++ standard technical report (and hopefully in a future version of the + standard).
+All of the ECMAScript regular expression syntax features are supported, except + that:
+Negated class escapes (\S, \D and \W) are not permitted inside character class + definitions ( [...] ).
+The escape sequence \u matches any upper case character (the same as + [[:upper:]]) rather than a Unicode escape sequence; use \x{DDDD} for + Unicode escape sequences.
+Almost all Perl features are supported, except for:
++
(?{code}) | +Not implementable in a compiled strongly typed language. | +
(??{code}) | +Not implementable in a compiled strongly typed language. | +
All the POSIX basic and extended regular expression features are supported, + except that:
+No character collating names are recognized except those specified in the POSIX + standard for the C locale, unless they are explicitly registered with the + traits class.
+Character equivalence classes ( [[=a=]] etc) are probably buggy except on + Win32. Implementing this feature requires knowledge of the format of the + string sort keys produced by the system; if you need this, and the default + implementation doesn't work on your platform, then you will need to supply a + custom traits class.
+The following comments refer to Unicode + Technical + Standard +#18: Unicode Regular Expressions version 9.
++
# | +Feature | +Support | +
1.1 | +Hex Notation | +Yes: use \x{DDDD} to refer to code point UDDDD. | +
1.2 | +Character Properties | +All the names listed under the General + Category Property are supported. Script names and Other Names are + not currently supported. | +
1.3 | +Subtraction and Intersection | +
+ Indirectly supported by forward-lookahead: + +(?=[[:X:]])[[:Y:]] +Gives the intersection of character properties X and Y. +(?![[:X:]])[[:Y:]] +Gives everything in Y that is not in X (subtraction). + |
+
1.4 | +Simple Word Boundaries | +Conforming: non-spacing marks are included in the set of word characters. | +
1.5 | +Caseless Matching | +Supported, note that at this level, case transformations are 1:1, many to many + case folding operations are not supported (for example "ß" to "SS"). | +
1.6 | +Line Boundaries | +Supported, except that "." matches only one character of "\r\n". Other than + that line boundaries match correctly; including not matching in the middle of a + "\r\n" sequence. | +
1.7 | +Code Points | +Supported: provided you use the u32* algorithms, + then UTF-8, UTF-16 and UTF-32 are all treated as sequences of 32-bit code + points. | +
2.1 | +Canonical Equivalence | +Not supported: it is up to the user of the library to convert all text into + the same canonical form as the regular expression. | +
2.2 | +Default Grapheme Clusters | +Not supported. | +
2.3 | ++ + | +Not supported. | +
2.4 | ++ + | +Not Supported. | +
2.5 | +Name Properties | +Supported: the expression "[[:name:]]" or \N{name} matches the named character + "name". | +
2.6 | +Wildcard properties | +Not Supported. | +
3.1 | +Tailored Punctuation. | +Not Supported. | +
3.2 | +Tailored Grapheme Clusters | +Not Supported. | +
3.3 | +Tailored Word Boundaries. | +Not Supported. | +
3.4 | +Tailored Loose Matches | +Partial support: [[=c=]] matches characters with the same primary equivalence + class as "c". | +
3.5 | +Tailored Ranges | +Supported: [a-b] matches any character that collates in the range a to b, when + the expression is constructed with the collate + flag set. | +
3.6 | +Context Matches | +Not Supported. | +
3.7 | +Incremental Matches | +Supported: pass the flag match_partial to + the regex algorithms. | +
3.8 | +Unicode Set Sharing | +Not Supported. | +
3.9 | +Possible Match Sets | +Not supported, however this information is used internally to optimise the + matching of regular expressions, and return quickly if no match is possible. | +
3.10 | +Folded Matching | +Partial Support: It is possible to achieve a similar effect by using a + custom regular expression traits class. | +
3.11 | +Custom Submatch Evaluation | +Not Supported. | +
Revised + + 28 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + + diff --git a/doc/sub_match.html b/doc/sub_match.html new file mode 100644 index 00000000..388d6773 --- /dev/null +++ b/doc/sub_match.html @@ -0,0 +1,571 @@ + + + ++
+ |
+
+ Boost.Regex+sub_match+ |
+
+ |
+
#include <boost/regex.hpp> +
+Regular expressions are different from many simple pattern-matching algorithms + in that as well as finding an overall match they can also produce + sub-expression matches: each sub-expression being delimited in the pattern by a + pair of parentheses (...). There has to be some method for reporting + sub-expression matches back to the user: this is achieved by defining a + class match_results that acts as an + indexed collection of sub-expression matches, each sub-expression match being + contained in an object of type sub_match + . +
Objects of type sub_match may only be obtained by subscripting an object + of type match_results + . +
Objects of type sub_match may be compared to objects of type std::basic_string, + or const charT* or const charT + . +
Objects of type sub_match may be added to objects of type std::basic_string, + or const charT* or const charT, to produce a new std::basic_string + + object. +
When the marked sub-expression denoted by an object of type sub_match<>
+ participated in a regular expression match then member matched
evaluates
+ to true, and members first
and second
denote the
+ range of characters [first,second)
which formed that match.
+ Otherwise matched
is false, and members first
and second
+ contain undefined values.
When the marked sub-expression denoted by an object of type sub_match<> + was repeated, then the sub_match object represents the match obtained by the + last repeat. The complete set of all the captures obtained for all the + repeats, may be accessed via the captures() member function (Note: this has + serious performance implications, you have to explicitly enable this feature).
+If an object of type sub_match<>
represents sub-expression 0
+ - that is to say the whole match - then member matched
is always
+ true, unless a partial match was obtained as a result of the flag match_partial
+ being passed to a regular expression algorithm, in which case member matched
+ is false, and members first
and second
represent the
+ character range that formed the partial match.
namespace boost{ + +template <class BidirectionalIterator> +class sub_match; + +typedef sub_match<const char*> csub_match; +typedef sub_match<const wchar_t*> wcsub_match; +typedef sub_match<std::string::const_iterator> ssub_match; +typedef sub_match<std::wstring::const_iterator> wssub_match; + +template <class BidirectionalIterator> +class sub_match : public std::pair<BidirectionalIterator, BidirectionalIterator> +{ +public: + typedef typename iterator_traits<BidirectionalIterator>::value_type value_type; + typedef typename iterator_traits<BidirectionalIterator>::difference_type difference_type; + typedef BidirectionalIterator iterator; + + bool matched; + + difference_type length()const; + operator basic_string<value_type>()const; + basic_string<value_type> str()const; + + int compare(const sub_match& s)const; + int compare(const basic_string<value_type>& s)const; + int compare(const value_type* s)const; +#ifdef BOOST_REGEX_MATCH_EXTRA + typedef implementation-private capture_sequence_type; + const capture_sequence_type& captures()const; +#endif +}; +// +// comparisons to another sub_match: +// +template <class BidirectionalIterator> +bool operator == (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator != (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator < (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator <= (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator >= (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator > (const sub_match<BidirectionalIterator>& lhs, + const 
sub_match<BidirectionalIterator>& rhs); + + +// +// comparisons to a basic_string: +// +template <class BidirectionalIterator, class traits, class Allocator> +bool operator == (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator, class traits, class Allocator> +bool operator != (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator, class traits, class Allocator> +bool operator < (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator, class traits, class Allocator> +bool operator > (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator, class traits, class Allocator> +bool operator >= (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator, class traits, class Allocator> +bool operator <= (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs); + +template <class BidirectionalIterator, class traits, class Allocator> +bool operator == (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs); +template <class BidirectionalIterator, class traits, class Allocator> +bool operator != (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs); 
+template <class BidirectionalIterator, class traits, class Allocator> +bool operator < (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs); +template <class BidirectionalIterator, class traits, class Allocator> +bool operator > (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs); +template <class BidirectionalIterator, class traits, class Allocator> +bool operator >= (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs); +template <class BidirectionalIterator, class traits, class Allocator> +bool operator <= (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs); + +// +// comparisons to a pointer to a character array: +// +template <class BidirectionalIterator> +bool operator == (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator != (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator < (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator > (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator >= (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator <= (typename 
iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs); + +template <class BidirectionalIterator> +bool operator == (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs); +template <class BidirectionalIterator> +bool operator != (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs); +template <class BidirectionalIterator> +bool operator < (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs); +template <class BidirectionalIterator> +bool operator > (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs); +template <class BidirectionalIterator> +bool operator >= (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs); +template <class BidirectionalIterator> +bool operator <= (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs); + +// +// comparisons to a single character: +// +template <class BidirectionalIterator> +bool operator == (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator != (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator < (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator > (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool 
operator >= (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs); +template <class BidirectionalIterator> +bool operator <= (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs); + +template <class BidirectionalIterator> +bool operator == (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs); +template <class BidirectionalIterator> +bool operator != (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs); +template <class BidirectionalIterator> +bool operator < (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs); +template <class BidirectionalIterator> +bool operator > (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs); +template <class BidirectionalIterator> +bool operator >= (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs); +template <class BidirectionalIterator> +bool operator <= (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs); +// +// addition operators: +// +template <class BidirectionalIterator, class traits, class Allocator> +std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type, traits, Allocator> + operator + (const std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& s, + const sub_match<BidirectionalIterator>& m); +template <class BidirectionalIterator, class traits, class Allocator> +std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type, traits, Allocator> + operator + (const sub_match<BidirectionalIterator>& m, + const 
std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& s); +template <class BidirectionalIterator> std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type> + operator + (typename iterator_traits<BidirectionalIterator>::value_type const* s, + const sub_match<BidirectionalIterator>& m); +template <class BidirectionalIterator> std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type> + operator + (const sub_match<BidirectionalIterator>& m, + typename iterator_traits<BidirectionalIterator>::value_type const * s); +template <class BidirectionalIterator> +std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type> + operator + (typename iterator_traits<BidirectionalIterator>::value_type const& s, + const sub_match<BidirectionalIterator>& m); +template <class BidirectionalIterator> +std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type> + operator + (const sub_match<BidirectionalIterator>& m, + typename iterator_traits<BidirectionalIterator>::value_type const& s); +template <class BidirectionalIterator> +std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type> + operator + (const sub_match<BidirectionalIterator>& m1, + const sub_match<BidirectionalIterator>& m2); + +// +// stream inserter: +// +template <class charT, class traits, class BidirectionalIterator> +basic_ostream<charT, traits>& + operator << (basic_ostream<charT, traits>& os, + const sub_match<BidirectionalIterator>& m); + +} // namespace boost+
typedef typename std::iterator_traits<iterator>::value_type value_type;+
The type pointed to by the iterators.
+typedef typename std::iterator_traits<iterator>::difference_type difference_type;+
A type that represents the difference between two iterators.
+typedef iterator iterator_type;+
The iterator type.
+iterator first+
An iterator denoting the position of the start of the match.
+iterator second+
An iterator denoting the position of the end of the match.
+bool matched+
A Boolean value denoting whether this sub-expression participated in the match.
+difference_type length()const;+
Effects: returns the length of this matched sub-expression, or 0 if this
+ sub-expression was not matched: (matched ? distance(first, second) : 0)
.
operator basic_string<value_type>()const;+
Effects: converts *this into a string: returns (matched ?
+ basic_string<value_type>(first, second) :
+ basic_string<value_type>()).
basic_string<value_type> str()const;+
Effects: returns a string representation of *this: (matched ?
+ basic_string<value_type>(first, second) :
+ basic_string<value_type>())
.
int compare(const sub_match& s)const;+
Effects: performs a lexical comparison to s: returns str().compare(s.str())
.
int compare(const basic_string<value_type>& s)const;+
Effects: compares *this to the string s: returns str().compare(s)
.
int compare(const value_type* s)const;+
Effects: compares *this to the null-terminated string s: returns
+ str().compare(s)
.
typedef implementation-private capture_sequence_type;+
Defines an implementation-specific type that satisfies the requirements of + a standard library Sequence (21.1.1 including the optional Table 68 + operations), whose value_type is a sub_match<BidirectionalIterator>. This + type happens to be std::vector<sub_match<BidirectionalIterator> >, + but you shouldn't actually rely on that.
+const capture_sequence_type& captures()const;+
Effects: returns a sequence containing all the captures + obtained for this sub-expression.
+Preconditions: the library must be built and used with + BOOST_REGEX_MATCH_EXTRA defined, and you must pass the flag + match_extra to the regex matching functions (regex_match, + regex_search, regex_iterator + or regex_token_iterator) in order for + this member function to be defined and return useful information.
+Rationale: Enabling this feature has several consequences: +
+template <class BidirectionalIterator> +bool operator == (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs.compare(rhs) == 0
.
template <class BidirectionalIterator> +bool operator != (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs.compare(rhs) != 0
.
template <class BidirectionalIterator> +bool operator < (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs.compare(rhs) < 0
.
template <class BidirectionalIterator> +bool operator <= (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs.compare(rhs) <= 0
.
template <class BidirectionalIterator> +bool operator >= (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs.compare(rhs) >= 0
.
template <class BidirectionalIterator> +bool operator > (const sub_match<BidirectionalIterator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs.compare(rhs) > 0
.
+template <class BidirectionalIterator, class traits, class Allocator> +bool operator == (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, + Allocator>& lhs, const sub_match<BidirectionalIterator>& rhs); ++
Effects: returns lhs == rhs.str()
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator != (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs != rhs.str()
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator < (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs < rhs.str()
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator > (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs > rhs.str()
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator >= (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs >= rhs.str()
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator <= (const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs <= rhs.str()
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator == (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs);+
Effects: returns lhs.str() == rhs
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator != (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs);+
Effects: returns lhs.str() != rhs
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator < (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs);+
Effects: returns lhs.str() < rhs
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator > (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs);+
Effects: returns lhs.str() > rhs
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator >= (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs);+
Effects: returns lhs.str() >= rhs
.
template <class BidirectionalIterator, class traits, class Allocator> +bool operator <= (const sub_match<BidirectionalIterator>& lhs, + const std::basic_string<iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& rhs);+
Effects: returns lhs.str() <= rhs
.
template <class BidirectionalIterator> +bool operator == (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs == rhs.str()
.
template <class BidirectionalIterator> +bool operator != (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs != rhs.str()
.
template <class BidirectionalIterator> +bool operator < (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs < rhs.str()
.
template <class BidirectionalIterator> +bool operator > (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs > rhs.str()
.
template <class BidirectionalIterator> +bool operator >= (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs >= rhs.str()
.
template <class BidirectionalIterator> +bool operator <= (typename iterator_traits<BidirectionalIterator>::value_type const* lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs <= rhs.str()
.
template <class BidirectionalIterator> +bool operator == (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs);+
Effects: returns lhs.str() == rhs
.
template <class BidirectionalIterator> +bool operator != (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs);+
Effects: returns lhs.str() != rhs
.
template <class BidirectionalIterator> +bool operator < (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs);+
Effects: returns lhs.str() < rhs
.
template <class BidirectionalIterator> +bool operator > (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs);+
Effects: returns lhs.str() > rhs
.
template <class BidirectionalIterator> +bool operator >= (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs);+
Effects: returns lhs.str() >= rhs
.
template <class BidirectionalIterator> +bool operator <= (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const* rhs);+
Effects: returns lhs.str() <= rhs
.
template <class BidirectionalIterator> +bool operator == (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs == rhs.str()
.
template <class BidirectionalIterator> +bool operator != (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs != rhs.str()
.
template <class BidirectionalIterator> +bool operator < (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs < rhs.str()
.
template <class BidirectionalIterator> +bool operator > (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs > rhs.str()
.
template <class BidirectionalIterator> +bool operator >= (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs >= rhs.str()
.
template <class BidirectionalIterator> +bool operator <= (typename iterator_traits<BidirectionalIterator>::value_type const& lhs, + const sub_match<BidirectionalIterator>& rhs);+
Effects: returns lhs <= rhs.str()
.
template <class BidirectionalIterator> +bool operator == (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs);+
Effects: returns lhs.str() == rhs
.
template <class BidirectionalIterator> +bool operator != (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs);+
Effects: returns lhs.str() != rhs
.
template <class BidirectionalIterator> +bool operator < (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs);+
Effects: returns lhs.str() < rhs
.
template <class BidirectionalIterator> +bool operator > (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs);+
Effects: returns lhs.str() > rhs
.
template <class BidirectionalIterator> +bool operator >= (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs);+
Effects: returns lhs.str() >= rhs
.
template <class BidirectionalIterator> +bool operator <= (const sub_match<BidirectionalIterator>& lhs, + typename iterator_traits<BidirectionalIterator>::value_type const& rhs);+
Effects: returns lhs.str() <= rhs
.
The addition operators for sub_match allow you to add a sub_match to any type + to which you can add a std::string and obtain a new string as the result.
+template <class BidirectionalIterator, class traits, class Allocator> +std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type, traits, Allocator> + operator + (const std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& s, + const sub_match<BidirectionalIterator>& m);+
Effects: returns s + m.str()
.
template <class BidirectionalIterator, class traits, class Allocator> +std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type, traits, Allocator> + operator + (const sub_match<BidirectionalIterator>& m, + const std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type, traits, Allocator>& s);+
Effects: returns m.str() + s
.
template <class BidirectionalIterator> std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type> + operator + (typename iterator_traits<BidirectionalIterator>::value_type const* s, + const sub_match<BidirectionalIterator>& m);+
Effects: returns s + m.str()
.
template <class BidirectionalIterator> std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type> + operator + (const sub_match<BidirectionalIterator>& m, + typename iterator_traits<BidirectionalIterator>::value_type const * s);+
Effects: returns m.str() + s
.
template <class BidirectionalIterator> +std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type> + operator + (typename iterator_traits<BidirectionalIterator>::value_type const& s, + const sub_match<BidirectionalIterator>& m);+
Effects: returns s + m.str()
.
template <class BidirectionalIterator> +std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type> + operator + (const sub_match<BidirectionalIterator>& m, + typename iterator_traits<BidirectionalIterator>::value_type const& s);+
Effects: returns m.str() + s
.
template <class BidirectionalIterator> +std::basic_string<typename iterator_traits<BidirectionalIterator>::value_type> + operator + (const sub_match<BidirectionalIterator>& m1, + const sub_match<BidirectionalIterator>& m2);+
Effects: returns m1.str() + m2.str()
.
template <class charT, class traits, class BidirectionalIterator> +basic_ostream<charT, traits>& + operator << (basic_ostream<charT, traits>& os, + const sub_match<BidirectionalIterator>& m);+
+ Effects: returns (os << m.str())
.
+
Revised + + 22 Dec 2004 +
+© Copyright John Maddock 1998- + + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/syntax.html b/doc/syntax.html new file mode 100644 index 00000000..e2b62f87 --- /dev/null +++ b/doc/syntax.html @@ -0,0 +1,55 @@ + + + ++
+ |
+
+ Boost.Regex+Regular Expression Syntax+ |
+
+ |
+
This section covers the regular expression syntax used by this library; this is + a programmer's guide, the actual syntax presented to your program's users will + depend upon the flags used during + expression compilation. +
+There are three main syntax options available, depending upon how + you construct the regular expression object:
+You can also construct a regular expression that treats every character as a + literal, but that's not really a "syntax"!
+Revised + + 10 Sept 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/syntax_basic.html b/doc/syntax_basic.html new file mode 100644 index 00000000..14096c38 --- /dev/null +++ b/doc/syntax_basic.html @@ -0,0 +1,238 @@ + + + ++
+ |
+
+ Boost.Regex+POSIX Basic Regular Expression Syntax+ |
+
+ |
+
The POSIX-Basic regular expression syntax is used by the Unix utility sed, + and variations are used by grep and emacs. You can + construct POSIX basic regular expressions in Boost.Regex by passing the flag basic + to the regex constructor, for example:
+// e1 is a case sensitive POSIX-Basic expression: +boost::regex e1(my_expression, boost::regex::basic); +// e2 a case insensitive POSIX-Basic expression: +boost::regex e2(my_expression, boost::regex::basic|boost::regex::icase);+
In POSIX-Basic regular expressions, all characters match themselves except + for the following special characters:
+.[\*^$+
The single character '.' when used outside of a character set will match any + single character except:
+The NULL character when the flag match_not_dot_null is passed to the + matching algorithms.
+The newline character when the flag match_not_dot_newline is passed to + the matching algorithms.
+A '^' character shall match the start of a line when used as the first + character of an expression, or the first character of a sub-expression.
+A '$' character shall match the end of a line when used as the last character + of an expression, or the last character of a sub-expression.
+A section beginning \( and ending \) acts as a marked sub-expression. + Whatever matched the sub-expression is split out in a separate field by the + matching algorithms. Marked sub-expressions can also be repeated, or + referred-to by a back-reference.
+Any atom (a single character, a marked sub-expression, or a character class) + can be repeated with the * operator.
+For example a* will match any number of letter a's repeated zero or more times + (an atom repeated zero times matches an empty string), so the expression a*b + will match any of the following:
+b +ab +aaaaaaaab+
An atom can also be repeated with a bounded repeat:
+a\{n\} Matches 'a' repeated exactly n times.
+a\{n,\} Matches 'a' repeated n or more times.
+a\{n, m\} Matches 'a' repeated between n and m times + inclusive.
+For example:
+^a\{2,3\}$+
Will match either of:
+aa +aaa+
But neither of:
+a +aaaa+
It is an error to use a repeat operator, if the preceding construct can not be + repeated, for example:
+a\(*\)+
Will raise an error, as there is nothing for the * operator to be applied to.
+An escape character followed by a digit n, where n is in the + range 1-9, matches the same string that was matched by sub-expression n. + For example the expression:
+^\(a*\).*\1$+
Will match the string:
+aaabbaaa+
But not the string:
+aaabba+
A character set is a bracket-expression starting with [ and ending with ], it + defines a set of characters, and matches any single character that is a member + of that set.
+A bracket expression may contain any combination of the following:
+++Single characters:
+For example [abc], will match any of the characters 'a', 'b', or 'c'.
+Character ranges:
+For example [a-c] will match any single character in the range 'a' to + 'c'. By default, for POSIX-Basic regular expressions, a character x + is within the range y to z, if it collates within that + range; this results in locale specific behavior. This behavior can + be turned off by unsetting the collate + option flag - in which case whether a character appears within a range is + determined by comparing the code points of the characters only.
+Negation:
+If the bracket-expression begins with the ^ character, then it matches the + complement of the characters it contains, for example [^a-c] matches any + character that is not in the range a-c.
+Character classes:
+An expression of the form [[:name:]] matches the named character class "name", + for example [[:lower:]] matches any lower case character. See + character class names.
+Collating Elements:
+An expression of the form [[.col.]] matches the collating element col. + A collating element is any single character, or any sequence of characters that + collates as a single unit. Collating elements may also be used as the end + point of a range, for example: [[.ae.]-c] matches the character sequence "ae", + plus any single character in the range "ae"-c, assuming that "ae" is treated + as a single collating element in the current locale.
+Collating elements may be used in place of escapes (which are not normally + allowed inside character sets), for example [[.^.]abc] would match either one + of the characters 'abc^'.
+As an extension, a collating element may also be specified via its + symbolic name, for example:
+[[.NUL.]]
+matches a NUL character.
+Equivalence classes:
+ An expression of the form [[=col=]], matches any character or collating element + whose primary sort key is the same as that for collating element col, + as with collating elements the name col may be a + symbolic name. A primary sort key is one that ignores case, + accentuation, or locale-specific tailorings; so for example [[=a=]] matches any + of the characters: a, à, á, â, ã, ä, å, A, À, Á, Â, Ã, Ä and Å. + Unfortunately implementation of this is reliant on the platform's collation and + localisation support; this feature can not be relied upon to work portably + across all platforms, or even all locales on one platform.
+
All of the above can be combined in one character set declaration, for example: + [[:digit:]a-c[.NUL.]].
+With the exception of the escape sequences \{, \}, \(, and \), which are + documented above, an escape followed by any character matches that + character. This can be used to make the special characters .[\*^$, + "ordinary". Note that the escape character loses its special meaning + inside a character set, so [\^] will match either a literal '\' or a '^'.
+When there is more than one way to match a regular expression, the "best" + possible match is obtained using the leftmost-longest + rule.
+When an expression is compiled with the flag grep set, then the + expression is treated as a newline separated list of POSIX-Basic + expressions, a match is found if any of the expressions in the list match, for + example:
+boost::regex e("abc\ndef", boost::regex::grep);+
will match either of the POSIX-Basic expressions "abc" or "def".
+As its name suggests, this behavior is consistent with the Unix utility grep.
+In addition to the POSIX-Basic features the following + characters are also special:
++++ repeats the preceding atom one or more times.
+? repeats the preceding atom zero or one times.
+*? A non-greedy version of *.
++? A non-greedy version of +.
+?? A non-greedy version of ?.
+
And the following escape sequences are also recognised:
+++\| specifies an alternative.
+\(?: ... \) is a non-marking grouping construct - allows you to + lexically group something without splitting out an extra sub-expression.
+\w matches any word character.
+\W matches any non-word character.
+\sx matches any character in the syntax group x, the following emacs + groupings are supported: 's', ' ', '_', 'w', '.', ')', '(', '"', '\'', '>' + and '<'. Refer to the emacs docs for details.
+\Sx matches any character not in the syntax grouping x.
+\c and \C are not supported.
+\` matches zero characters only at the start of a buffer (or string being + matched).
+\' matches zero characters only at the end of a buffer (or string being + matched).
+\b matches zero characters at a word boundary.
+\B matches zero characters, not at a word boundary.
+\< matches zero characters only at the start of a word.
+\> matches zero characters only at the end of a word.
+
Finally, you should note that emacs style regular expressions are + matched according to the Perl "depth first search" + rules. Emacs expressions are matched this way because they contain + Perl-like extensions, that do not interact well with the + POSIX-style leftmost-longest rule.
+There are a variety of flags that + may be combined with the basic and grep options when + constructing the regular expression, in particular note that the + newline_alt, no_char_classes, no_intervals, bk_plus_qm and bk_vbar options + all alter the syntax, while the collate + and icase options modify how the case and locale sensitivity are to be + applied.
++
Revised + + 21 Aug 2004 +
+© Copyright John Maddock 2004
+ +Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt).
+ + + diff --git a/doc/syntax_extended.html b/doc/syntax_extended.html new file mode 100644 index 00000000..d9253166 --- /dev/null +++ b/doc/syntax_extended.html @@ -0,0 +1,520 @@ + + + ++
+ |
+
+ Boost.Regex+POSIX-Extended Regular Expression Syntax+ |
+
+ |
+
The POSIX-Extended regular expression syntax is supported by the POSIX C + regular expression API's, and variations are used by the utilities egrep + and awk. You can construct POSIX extended regular expressions in + Boost.Regex by passing the flag extended to the regex constructor, for + example:
+// e1 is a case sensitive POSIX-Extended expression: +boost::regex e1(my_expression, boost::regex::extended); +// e2 a case insensitive POSIX-Extended expression: +boost::regex e2(my_expression, boost::regex::extended|boost::regex::icase);+
In POSIX-Extended regular expressions, all characters match themselves except + for the following special characters:
+.[{()\*+?|^$+
The single character '.' when used outside of a character set will match any + single character except:
+The NULL character when the flag match_not_dot_null is passed to the + matching algorithms.
+The newline character when the flag match_not_dot_newline is passed to + the matching algorithms.
+A '^' character shall match the start of a line when used as the first + character of an expression, or the first character of a sub-expression.
+A '$' character shall match the end of a line when used as the last character + of an expression, or the last character of a sub-expression.
+A section beginning ( and ending ) acts as a marked sub-expression. + Whatever matched the sub-expression is split out in a separate field by the + matching algorithms. Marked sub-expressions can also be repeated, or + referred to by a back-reference.
+Any atom (a single character, a marked sub-expression, or a character class) + can be repeated with the *, +, ?, and {} operators.
+The * operator will match the preceding atom zero or more times, for example + the expression a*b will match any of the following:
+b +ab +aaaaaaaab+
The + operator will match the preceding atom one or more times, for example the + expression a+b will match any of the following:
+ab +aaaaaaaab+
But will not match:
+b+
The ? operator will match the preceding atom zero or one times, for + example the expression ca?b will match any of the following:
+cb +cab+
But will not match:
+caab+
An atom can also be repeated with a bounded repeat:
+a{n} Matches 'a' repeated exactly n times.
+a{n,} Matches 'a' repeated n or more times.
+a{n, m} Matches 'a' repeated between n and m times + inclusive.
+For example:
+^a{2,3}$+
Will match either of:
+aa +aaa+
But neither of:
+a +aaaa+
It is an error to use a repeat operator, if the preceding construct can not be + repeated, for example:
+a(*)+
Will raise an error, as there is nothing for the * operator to be applied to.
+An escape character followed by a digit n, where n is in the + range 1-9, matches the same string that was matched by sub-expression n. + For example the expression:
+^(a*).*\1$+
Will match the string:
+aaabbaaa+
But not the string:
+aaabba+
Caution: the POSIX standard does not support back-references + for "extended" regular expressions, this is a compatible extension to that + standard.
+The | operator will match either of its arguments, so for example: abc|def will + match either "abc" or "def". +
+Parentheses can be used to group alternations, for example: ab(d|ef) will match + either of "abd" or "abef".
+A character set is a bracket-expression starting with [ and ending with ], it + defines a set of characters, and matches any single character that is a member + of that set.
+A bracket expression may contain any combination of the following:
+++Single characters:
+For example [abc], will match any of the characters 'a', 'b', or 'c'.
+Character ranges:
+For example [a-c] will match any single character in the range 'a' to + 'c'. By default, for POSIX-Extended regular expressions, a character x + is within the range y to z, if it collates within that + range; this results in locale specific behavior . + This behavior can be turned off by unsetting the + collate option flag - in which case whether a character appears + within a range is determined by comparing the code points of the characters + only.
+Negation:
+If the bracket-expression begins with the ^ character, then it matches the + complement of the characters it contains, for example [^a-c] matches any + character that is not in the range a-c.
+Character classes:
+An expression of the form [[:name:]] matches the named character class "name", + for example [[:lower:]] matches any lower case character. See + character class names.
+Collating Elements:
+An expression of the form [[.col.]] matches the collating element col. + A collating element is any single character, or any sequence of characters that + collates as a single unit. Collating elements may also be used as the end + point of a range, for example: [[.ae.]-c] matches the character sequence "ae", + plus any single character in the range "ae"-c, assuming that "ae" is treated as + a single collating element in the current locale.
+Collating elements may be used in place of escapes (which are not normally + allowed inside character sets), for example [[.^.]abc] would match either one + of the characters 'abc^'.
+As an extension, a collating element may also be specified via its + symbolic name, for example:
+[[.NUL.]]
+matches a NUL character.
+Equivalence classes:
++ An expression of the form [[=col=]], matches any character or collating element + whose primary sort key is the same as that for collating element col, + as with collating elements the name col may be a + symbolic name. A primary sort key is one that ignores case, + accentation, or locale-specific tailorings; so for example [[=a=]] matches any + of the characters: a, à, á, â, ã, ä, å, A, À, Á, Â, Ã, Ä and Å. + Unfortunately implementation of this is reliant on the platform's collation and + localisation support; this feature can not be relied upon to work portably + across all platforms, or even all locales on one platform.
+
All of the above can be combined in one character set declaration, for example: + [[:digit:]a-c[.NUL.]].
+The POSIX standard defines no escape sequences for POSIX-Extended regular + expressions, except that:
+However, that's rather restrictive, so the following standard-compatible + extensions are also supported by Boost.Regex:
+++Escapes matching a specific character
+The following escape sequences are all synonyms for single characters:
++
+
+ ++ +Escape +Character ++ +\a +'\a' ++ +\e +0x1B ++ +\f +\f ++ +\n +\n ++ +\r +\r ++ +\t +\t ++ +\v +\v ++ +\b +\b (but only inside a character class declaration). ++ +\cX +An ASCII escape sequence - the character whose code point is X % 32 ++ +\xdd +A hexadecimal escape sequence - matches the single character whose code point + is 0xdd. ++ +\x{dddd} +A hexadecimal escape sequence - matches the single character whose code point + is 0xdddd. ++ +\0ddd +An octal escape sequence - matches the single character whose code point is + 0ddd. ++ +\N{Name} +Matches the single character which has the symbolic + name name. For example \N{newline} matches the single + character \n. +"Single character" character classes:
+Any escaped character x, if x is the name of a character + class shall match any character that is a member of that class, and any escaped + character X, if x is the name of a character class, shall + match any character not in that class.
+The following are supported by default:
++
+
+ ++ +Escape sequence +Equivalent to ++ +\d +[[:digit:]] ++ +\l +[[:lower:]] ++ +\s +[[:space:]] ++ +\u +[[:upper:]] ++ +\w +[[:word:]] ++ +\D +[^[:digit:]] ++ +\L +[^[:lower:]] ++ +\S +[^[:space:]] ++ +\U +[^[:upper:]] ++ +\W +[^[:word:]] ++
+Character Properties
+The character property names in the following table are all + equivalent to the names used in character + classes.
++
+
+ ++ +Form +Description +Equivalent character set form ++ +\pX +Matches any character that has the property X. +[[:X:]] ++ +\p{Name} +Matches any character that has the property Name. +[[:Name:]] ++ +\PX +Matches any character that does not have the property X. +[^[:X:]] ++ +\P{Name} +Matches any character that does not have the property Name. +[^[:Name:]] +Word Boundaries
+The following escape sequences match the boundaries of words:
++
+
+ ++ +\< +Matches the start of a word. ++ +\> +Matches the end of a word. ++ +\b +Matches a word boundary (the start or end of a word). ++ +\B +Matches only when not at a word boundary. +Buffer boundaries
+The following match only at buffer boundaries: a "buffer" in this context is + the whole of the input text that is being matched against (note that ^ and + $ may match embedded newlines within the text).
++
+
+ ++ +\` +Matches at the start of a buffer only. ++ +\' +Matches at the end of a buffer only. ++ +\A +Matches at the start of a buffer only (the same as \`). ++ +\z +Matches at the end of a buffer only (the same as \'). ++ +\Z +Matches an optional sequence of newlines at the end of a buffer: equivalent to + the regular expression \n*\z +Continuation Escape
+The sequence \G matches only at the end of the last match found, or at the + start of the text being matched if no previous match was found. This + escape is useful if you're iterating over the matches contained within a text, and + you want each subsequent match to start where the last one ended.
+Quoting escape
+The escape sequence \Q begins a "quoted sequence": all the subsequent + characters are treated as literals, until either the end of the regular + expression or \E is found. For example the expression: \Q\*+\Ea+ would + match either of:
+\*+a+
\*+aaaUnicode escapes
++
+
+ ++ +\C +Matches a single code point: in Boost regex this has exactly the same effect + as a "." operator. ++ +\X +Matches a combining character sequence: that is any non-combining character + followed by a sequence of zero or more combining characters. +Any other escape
+Any other escape sequence matches the character that is escaped, for example \@ + matches a literal '@'.
+
The order of precedence of operators is as shown in the following + table:
++
Collation-related bracket symbols | +[==] [::] [..] | +
Escaped characters + | +\ | +
Character set (bracket expression) + | +[] | +
Grouping | +() | +
Single-character-ERE duplication + | +* + ? {m,n} | +
Concatenation | ++ |
Anchoring | +^$ | +
Alternation | +| | +
When there is more than one way to match a regular expression, the "best" + possible match is obtained using the leftmost-longest + rule.
+When an expression is compiled with the flag egrep set, then the + expression is treated as a newline separated list of POSIX-Extended + expressions, a match is found if any of the expressions in the list match, for + example:
+boost::regex e("abc\ndef", boost::regex::egrep);+
will match either of the POSIX-Extended expressions "abc" or "def".
+As its name suggests, this behavior is consistent with the Unix utility egrep, + and with grep when used with the -E option.
+In addition to the POSIX-Extended features the + escape character is special inside a character class declaration.
+In addition, some escape sequences that are not defined as part of + POSIX-Extended specification are required to be supported - however Boost.Regex + supports these by default anyway.
+There are a variety of flags that + may be combined with the extended and egrep options when + constructing the regular expression, in particular note that the + newline_alt option alters the syntax, while the + collate, nosubs and icase options modify how the case and locale + sensitivity are to be applied.
+Revised + + 21 Aug 2004 +
+© Copyright John Maddock 2004
+ +Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt).
+ + + diff --git a/doc/syntax_leftmost_longest.html b/doc/syntax_leftmost_longest.html new file mode 100644 index 00000000..6330fc5f --- /dev/null +++ b/doc/syntax_leftmost_longest.html @@ -0,0 +1,65 @@ + + + ++
+ |
+
+ Boost.Regex+The "Leftmost Longest" Rule+ |
+
+ |
+
Often there is more than one way of matching a regular expression at a + particular location, for POSIX basic and extended regular expressions, the + "best" match is determined as follows:
++
Revised + + 16 Dec 2004
+© Copyright John Maddock 1998- + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + + diff --git a/doc/syntax_option_type.html b/doc/syntax_option_type.html new file mode 100644 index 00000000..fbd5cdbe --- /dev/null +++ b/doc/syntax_option_type.html @@ -0,0 +1,543 @@ + + + ++
+ |
+
+ Boost.Regex+syntax_option_type+ |
+
+ |
+
Type syntax_option_type is an implementation specific bitmask type that + controls how a regular expression string is to be interpreted. For + convenience note that all the constants listed here, are also duplicated within + the scope of class template basic_regex.
+namespace std{ namespace regex_constants{ + +typedef implementation-specific-bitmask-type syntax_option_type;+
+// these flags are standardized: +static const syntax_option_type normal; +static const syntax_option_type ECMAScript = normal; +static const syntax_option_type JavaScript = normal; +static const syntax_option_type JScript = normal; +static const syntax_option_type perl = normal;
static const syntax_option_type basic; +static const syntax_option_type sed = basic; +static const syntax_option_type extended; +static const syntax_option_type awk; +static const syntax_option_type grep; +static const syntax_option_type egrep; +static const syntax_option_type icase; +static const syntax_option_type nosubs; +static const syntax_option_type optimize; +static const syntax_option_type collate; +// other boost.regex specific options are listed below
+} // namespace regex_constants +} // namespace std
The type syntax_option_type
is an implementation specific bitmask
+ type (17.3.2.1.2). Setting its elements has the effects listed in the table
+ below, a valid value of type syntax_option_type
will always have
+ exactly one of the elements normal, basic, extended, awk, grep, egrep, sed,
+ literal or perl
set.
Note that for convenience all the constants listed here are duplicated within + the scope of class template basic_regex, so you can use any of:
+boost::regex_constants::constant_name+
or
+boost::regex::constant_name+
or
+boost::wregex::constant_name+
in an interchangeable manner.
+One of the following must always be set for perl regular expressions:
++
Element | +Standardized | +Effect when set | +
+ ECMAScript + |
+ Yes | +
+ Specifies that the grammar recognized by the regular expression engine uses its + normal semantics: that is the same as that given in the ECMA-262, ECMAScript + Language Specification, Chapter 15 part 10, RegExp (Regular Expression) Objects + (FWD.1). +boost.regex also recognizes all of the perl-compatible (?...) extensions in + this mode. + |
+
perl | +No | +As above. | +
normal | +No | +As above. | +
JavaScript | +No | +As above. | +
JScript | +No | +As above. | +
The following options may also be set when using perl-style regular + expressions:
++
Element | +Standardized | +Effect when set | +
icase | +Yes | +
+ Specifies that matching of regular expressions against a character container + sequence shall be performed without regard to case. + |
+
nosubs | +Yes | +
+ Specifies that when a regular expression is matched against a character + container sequence, then no sub-expression matches are to be stored in the + supplied match_results structure. + |
+
optimize | +Yes | +
+ Specifies that the regular expression engine should pay more attention to the + speed with which regular expressions are matched, and less to the speed with + which regular expression objects are constructed. Otherwise it has no + detectable effect on the program output. This currently has no effect for + Boost.Regex. + |
+
collate | +Yes | +
+ Specifies that character ranges of the form "[a-b]" should be locale sensitive. + |
+
newline_alt | +No | +Specifies that the \n character has the same effect as the alternation + operator |. Allows newline separated lists to be used as a list of + alternatives. | +
no_except | +No | +Prevents basic_regex from throwing an exception when an invalid expression is + encountered. | +
no_mod_m | +No | +Normally Boost.Regex behaves as if the Perl m-modifier is on: so the + assertions ^ and $ match after and before embedded newlines respectively, + setting this flag is equivalent to prefixing the expression with (?-m). | +
no_mod_s | +No | +Normally whether Boost.Regex will match "." against a newline character is + determined by the match flag match_not_dot_newline. + Specifying this flag is equivalent to prefixing the expression with (?-s) and + therefore causes "." not to match a newline character regardless of whether + match_not_dot_newline is set in the match flags. | +
mod_s | +No | +Normally whether Boost.Regex will match "." against a newline character is + determined by the match flag match_not_dot_newline. + Specifying this flag is equivalent to prefixing the expression with (?s) and + therefore causes "." to match a newline character regardless of whether + match_not_dot_newline is set in the match flags. | +
mod_x | +No | +Turns on the perl x-modifier: causes unescaped whitespace in the expression to + be ignored. | +
Exactly one of the following must always be set for POSIX extended regular + expressions:
++
Element | +Standardized | +Effect when set | +
extended | +Yes | +
+ Specifies that the grammar recognized by the regular expression engine is the + same as that used by POSIX extended regular expressions in IEEE Std + 1003.1-2001, Portable Operating System Interface (POSIX ), Base Definitions and + Headers, Section 9, Regular Expressions (FWD.1). + +In addition some perl-style escape sequences are supported (The POSIX standard + specifies that only "special" characters may be escaped, all other escape + sequences result in undefined behavior). + |
+
egrep | +Yes | +
+ Specifies that the grammar recognized by the regular expression engine is the + same as that used by POSIX utility grep when given the -E option in IEEE Std + 1003.1-2001, Portable Operating System Interface (POSIX ), Shells and + Utilities, Section 4, Utilities, grep (FWD.1). +That is to say, the same as POSIX extended syntax, but with the newline + character acting as an alternation character in addition to "|". + |
+
awk | +Yes | +
+ Specifies that the grammar recognized by the regular expression engine is the + same as that used by POSIX utility awk in IEEE Std 1003.1-2001, Portable + Operating System Interface (POSIX ), Shells and Utilities, Section 4, awk + (FWD.1). +That is to say: the same as POSIX extended syntax, but with escape sequences in + character classes permitted. +In addition some perl-style escape sequences are supported (actually the awk + syntax only requires \a \b \t \v \f \n and \r to be recognised, all other + Perl-style escape sequences invoke undefined behavior according to the POSIX + standard, but are in fact recognised by Boost.Regex). + |
+
The following options may also be set when using POSIX extended regular + expressions:
++
Element | +Standardized | +Effect when set | +||
icase | +Yes | +
+ Specifies that matching of regular expressions against a character container + sequence shall be performed without regard to case. + |
+ ||
nosubs | +Yes | +
+ Specifies that when a regular expression is matched against a character + container sequence, then no sub-expression matches are to be stored in the + supplied match_results structure. + |
+ ||
optimize | +Yes | +
+ Specifies that the regular expression engine should pay more attention to the + speed with which regular expressions are matched, and less to the speed with + which regular expression objects are constructed. Otherwise it has no + detectable effect on the program output. This currently has no effect for + boost.regex. + |
+ ||
collate | +Yes | +
+ Specifies that character ranges of the form "[a-b]" should be locale + sensitive. This bit is on by default for + POSIX-Extended regular expressions, but can be unset to force ranges to be + compared by code point only. + |
+ ||
newline_alt | +No | +Specifies that the \n character has the same effect as the alternation + operator |. Allows newline separated lists to be used as a list of + alternatives. | +||
no_escape_in_lists | +No | +When set this makes the escape character ordinary inside lists, so that [\b] + would match either '\' or 'b'. This bit is on by default for + POSIX-Extended regular expressions, but can be unset to force escapes to be + recognised inside lists. | +
no_bk_refs | +No | +When set then backreferences are disabled. This bit is + on by default for POSIX-Extended regular expressions, but can be + unset to turn support for backreferences on. | +
no_except | ++ | + | No | +Prevents basic_regex from throwing an exception when an invalid expression is + encountered. | +
Exactly one of the following must always be set for POSIX basic regular + expressions:
++
Element | +Standardized | +Effect When Set | +
basic | +Yes | +
+ Specifies that the grammar recognized by the regular expression engine is the + same as that used by POSIX basic regular + expressions in IEEE Std 1003.1-2001, Portable Operating System Interface + (POSIX ), Base Definitions and Headers, Section 9, Regular Expressions (FWD.1). + + |
+
sed | +No | +As Above. | +
grep | +Yes | +
+ Specifies that the grammar recognized by the regular expression engine is the + same as that used by POSIX utility grep in + IEEE Std 1003.1-2001, Portable Operating System Interface (POSIX ), Shells and + Utilities, Section 4, Utilities, grep (FWD.1). +That is to say, the same as POSIX basic syntax, but with the newline character + acting as an alternation character; the expression is treated as a newline + separated list of alternatives. + |
+
emacs | +No | +Specifies that the grammar recognised is the superset of the POSIX-Basic + syntax used by the emacs program. | +
The following options may also be set when using POSIX basic regular + expressions:
++
Element | +Standardized | +Effect when set | +||
icase | +Yes | +
+ Specifies that matching of regular expressions against a character container + sequence shall be performed without regard to case. + |
+ ||
nosubs | +Yes | +
+ Specifies that when a regular expression is matched against a character + container sequence, then no sub-expression matches are to be stored in the + supplied match_results structure. + |
+ ||
optimize | +Yes | +
+ Specifies that the regular expression engine should pay more attention to the + speed with which regular expressions are matched, and less to the speed with + which regular expression objects are constructed. Otherwise it has no + detectable effect on the program output. This currently has no effect for + boost.regex. + |
+ ||
collate | +Yes | +
+ Specifies that character ranges of the form "[a-b]" should be locale + sensitive. This bit is on by default for + POSIX-Basic regular expressions, but can be unset to force ranges to be + compared by code point only. + |
+ ||
newline_alt | +No | +Specifies that the \n character has the same effect as the alternation + operator |. Allows newline separated lists to be used as a list of + alternatives. This bit is already set, if you use the grep option. | +||
no_char_classes | +No | +When set then character classes such as [[:alnum:]] are not allowed. | +||
no_escape_in_lists | +No | +When set this makes the escape character ordinary inside lists, so that [\b] + would match either '\' or 'b'. This bit is on by default for + POSIX-basic regular expressions, but can be unset to force escapes to be + recognised inside lists. | +
no_intervals | +No | +When set then bounded repeats such as a{2,3} are not permitted. | +||
bk_plus_qm | +No | +When set then \? acts as a zero-or-one repeat operator, and \+ acts as a + one-or-more repeat operator. | +||
bk_vbar | +No | +When set then \| acts as the alternation operator. | +||
no_except | ++ | + | No | +Prevents basic_regex from throwing an exception when an invalid expression is + encountered. | +
The following must always be set to interpret the expression as a string + literal:
++
Element | +Standardized | +Effect when set | +
literal | +Yes | +Treat the string as a literal (no special characters). | +
The following options may also be combined with the literal flag:
++
Element | +Standardized | +Effect when set | +
icase | +Yes | +
+ Specifies that matching of regular expressions against a character container + sequence shall be performed without regard to case. + |
+
optimize | +Yes | +
+ Specifies that the regular expression engine should pay more attention to the + speed with which regular expressions are matched, and less to the speed with + which regular expression objects are constructed. Otherwise it has no + detectable effect on the program output. This currently has no effect for + boost.regex. + |
+
+
Revised + + 23 June 2004 +
+© Copyright John Maddock 1998- + 2004
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/syntax_perl.html b/doc/syntax_perl.html new file mode 100644 index 00000000..3eda0385 --- /dev/null +++ b/doc/syntax_perl.html @@ -0,0 +1,626 @@ + + + ++
+ |
+
+ Boost.Regex++ Perl Regular Expression Syntax+ |
+
+ |
+
The Perl regular expression syntax is based on that used by the programming + language Perl. Perl regular expressions are the default + behavior in Boost.Regex or you can pass the flag perl to the + regex constructor, for example:
+// e1 is a case sensitive Perl regular expression: +// since Perl is the default option there's no need to explicitly specify the syntax used here: +boost::regex e1(my_expression); +// e2 a case insensitive Perl regular expression: +boost::regex e2(my_expression, boost::regex::perl|boost::regex::icase);+
In Perl regular expressions, all characters match themselves except for + the following special characters:
+.[{()\*+?|^$+
The single character '.' when used outside of a character set will match any + single character except:
+The NULL character when the flag match_not_dot_null is passed to the + matching algorithms.
+The newline character when the flag match_not_dot_newline is passed to + the matching algorithms.
+A '^' character shall match the start of a line.
+A '$' character shall match the end of a line.
+A section beginning ( and ending ) acts as a marked sub-expression. + Whatever matched the sub-expression is split out in a separate field by the + matching algorithms. Marked sub-expressions can also be repeated, or + referred to by a back-reference.
+A marked sub-expression is useful to lexically group part of a regular + expression, but has the side-effect of splitting out an extra field in the + result. As an alternative you can lexically group part of a regular + expression, without generating a marked sub-expression by using (?: and ), for + example (?:ab)+ will repeat "ab" without splitting out any separate + sub-expressions.
+Any atom (a single character, a marked sub-expression, or a character class) + can be repeated with the *, +, ?, and {} operators.
+The * operator will match the preceding atom zero or more times, for example + the expression a*b will match any of the following:
+b +ab +aaaaaaaab+
The + operator will match the preceding atom one or more times, for example the + expression a+b will match any of the following:
+ab +aaaaaaaab+
But will not match:
+b+
The ? operator will match the preceding atom zero or one times, for + example the expression ca?b will match any of the following:
+cb +cab+
But will not match:
+caab+
An atom can also be repeated with a bounded repeat:
+a{n} Matches 'a' repeated exactly n times.
+a{n,} Matches 'a' repeated n or more times.
+a{n, m} Matches 'a' repeated between n and m times + inclusive.
+For example:
+^a{2,3}$+
Will match either of:
+aa +aaa+
But neither of:
+a +aaaa+
It is an error to use a repeat operator, if the preceding construct can not be + repeated, for example:
+a(*)+
Will raise an error, as there is nothing for the * operator to be applied to.
+The normal repeat operators are "greedy", that is to say they will consume as + much input as possible. There are non-greedy versions available that will + consume as little input as possible while still producing a match.
+*? Matches the previous atom zero or more times, while consuming as little + input as possible.
++? Matches the previous atom one or more times, while consuming as little input + as possible.
+?? Matches the previous atom zero or one times, while consuming as little input + as possible.
+{n,}? Matches the previous atom n or more times, while consuming + as little input as possible.
+{n,m}? Matches the previous atom between n and m times, + while consuming as little input as possible.
+An escape character followed by a digit n, where n is in the + range 1-9, matches the same string that was matched by sub-expression n. + For example the expression:
+^(a*).*\1$+
Will match the string:
+aaabbaaa+
But not the string:
+aaabba+
The | operator will match either of its arguments, so for example: abc|def will + match either "abc" or "def". +
+Parentheses can be used to group alternations, for example: ab(d|ef) will match + either of "abd" or "abef".
+Empty alternatives are not allowed (these are almost always a mistake), + but if you really want an empty alternative use (?:) as a placeholder, for + example:
+++"|abc" is not a valid expression, but
+
+ "(?:)|abc" is and is equivalent, also the expression:
+ "(?:abc)??" has exactly the same effect.
A character set is a bracket-expression starting with [ and ending with ], it + defines a set of characters, and matches any single character that is a member + of that set.
+A bracket expression may contain any combination of the following:
+++Single characters:
+For example [abc], will match any of the characters 'a', 'b', or 'c'.
+Character ranges:
+For example [a-c] will match any single character in the range 'a' to + 'c'. By default, for POSIX-Perl regular expressions, a character x + is within the range y to z, if it collates within that + range; this results in locale specific behavior. This behavior can + be turned off by unsetting the collate + option flag - in which case whether a character appears within a range is + determined by comparing the code points of the characters only
+Negation:
+If the bracket-expression begins with the ^ character, then it matches the + complement of the characters it contains, for example [^a-c] matches any + character that is not in the range a-c.
+Character classes:
+An expression of the form [[:name:]] matches the named character class "name", + for example [[:lower:]] matches any lower case character. See + character class names.
+Collating Elements:
+An expression of the form [[.col.]] matches the collating element col. + A collating element is any single character, or any sequence of characters that + collates as a single unit. Collating elements may also be used as the end + point of a range, for example: [[.ae.]-c] matches the character sequence "ae", + plus any single character in the range "ae"-c, assuming that "ae" is treated as + a single collating element in the current locale.
+As an extension, a collating element may also be specified via its + symbolic name, for example:
+[[.NUL.]]
+matches a NUL character.
+Equivalence classes:
++ An expression of the form [[=col=]], matches any character or collating element + whose primary sort key is the same as that for collating element col, + as with collating elements the name col may be a + symbolic name. A primary sort key is one that ignores case, + accentuation, or locale-specific tailorings; so for example [[=a=]] matches any + of the characters: a, à, á, â, ã, ä, å, A, À, Á, Â, Ã, Ä and Å. + Unfortunately implementation of this is reliant on the platform's collation and + localisation support; this feature can not be relied upon to work portably + across all platforms, or even all locales on one platform.
+Escapes:
+All the escape sequences that match a single character, or a single character + class are permitted within a character class definition, except the + negated character classes (\D \W etc).
+
All of the above can be combined in one character set declaration, for example: + [[:digit:]a-c[.NUL.]].
+Any special character preceded by an escape shall match itself. +
+The following escape sequences are also supported:
+++Escapes matching a specific character
+The following escape sequences are all synonyms for single characters:
++
+
+ ++ +Escape +Character ++ +\a +'\a' ++ +\e +0x1B ++ +\f +\f ++ +\n +\n ++ +\r +\r ++ +\t +\t ++ +\v +\v ++ +\b +\b (but only inside a character class declaration). ++ +\cX +An ASCII escape sequence - the character whose code point is X % 32 ++ +\xdd +A hexadecimal escape sequence - matches the single character whose code point + is 0xdd. ++ +\x{dddd} +A hexadecimal escape sequence - matches the single character whose code point + is 0xdddd. ++ +\0ddd +An octal escape sequence - matches the single character whose code point is + 0ddd. ++ +\N{name} +Matches the single character which has the symbolic + name name. For example \N{newline} matches the single + character \n. +"Single character" character classes:
+Any escaped character x, if x is the name of a character + class shall match any character that is a member of that class, and any escaped + character X, if x is the name of a character class, shall + match any character not in that class.
+The following are supported by default:
++
+
+ ++ +Escape sequence +Equivalent to ++ +\d +[[:digit:]] ++ +\l +[[:lower:]] ++ +\s +[[:space:]] ++ +\u +[[:upper:]] ++ +\w +[[:word:]] ++ +\D +[^[:digit:]] ++ +\L +[^[:lower:]] ++ +\S +[^[:space:]] ++ +\U +[^[:upper:]] ++ +\W +[^[:word:]] +Character Properties
+The character property names in the following table are all equivalent to the + names used in character classes.
++
+
+ ++ +Form +Description +Equivalent character set form ++ +\pX +Matches any character that has the property X. +[[:X:]] ++ +\p{Name} +Matches any character that has the property Name. +[[:Name:]] ++ +\PX +Matches any character that does not have the property X. +[^[:X:]] ++ +\P{Name} +Matches any character that does not have the property Name. +[^[:Name:]] +Word Boundaries
+The following escape sequences match the boundaries of words:
++
+
+ ++ +\< +Matches the start of a word. ++ +\> +Matches the end of a word. ++ +\b +Matches a word boundary (the start or end of a word). ++ +\B +Matches only when not at a word boundary. +Buffer boundaries
+The following match only at buffer boundaries: a "buffer" in this context is + the whole of the input text that is being matched against (note that ^ and + $ may match embedded newlines within the text).
++
+
+ ++ +\` +Matches at the start of a buffer only. ++ +\' +Matches at the end of a buffer only. ++ +\A +Matches at the start of a buffer only (the same as \`). ++ +\z +Matches at the end of a buffer only (the same as \'). ++ +\Z +Matches an optional sequence of newlines at the end of a buffer: equivalent to + the regular expression \n*\z +Continuation Escape
+The sequence \G matches only at the end of the last match found, or at the + start of the text being matched if no previous match was found. This + escape is useful if you're iterating over the matches contained within a text, and + you want each subsequent match to start where the last one ended.
+Quoting escape
+The escape sequence \Q begins a "quoted sequence": all the subsequent + characters are treated as literals, until either the end of the regular + expression or \E is found. For example the expression: \Q\*+\Ea+ would + match either of:
+\*+a+
\*+aaaUnicode escapes
++
+
+ ++ +\C +Matches a single code point: in Boost regex this has exactly the same effect + as a "." operator. ++ +\X +Matches a combining character sequence: that is any non-combining character + followed by a sequence of zero or more combining characters. +Any other escape
+Any other escape sequence matches the character that is escaped, for example \@ + matches a literal '@'.
+
Perl-specific extensions to the regular expression syntax all start + with (?.
+++Comments
+(?# ... ) is treated as a comment, its contents are ignored.
+Modifiers
+(?imsx-imsx ... ) alters which of the perl modifiers are in effect + within the pattern, changes take effect from the point that the block is first + seen and extend to any enclosing ). Letters before a '-' turn that perl + modifier on, letters afterward, turn it off.
+(?imsx-imsx:pattern) applies the specified modifiers to pattern + only.
+Non-marking grouping
+(?:pattern) lexically groups pattern, without generating an + additional sub-expression.
+Lookahead
+(?=pattern) consumes zero characters, only if pattern matches.
+(?!pattern) consumes zero characters, only if pattern does + not match.
+Lookahead is typically used to create the logical AND of two regular + expressions, for example if a password must contain a lower case letter, an + upper case letter, a punctuation symbol, and be at least 6 characters long, + then the expression:
+(?=.*[[:lower:]])(?=.*[[:upper:]])(?=.*[[:punct:]]).{6,}+could be used to validate the password.
+Lookbehind
+(?<=pattern) consumes zero characters, only if pattern could + be matched against the characters preceding the current position (pattern + must be of fixed length).
+(?<!pattern) consumes zero characters, only if pattern could + not be matched against the characters preceding the current position (pattern + must be of fixed length).
+Independent sub-expressions
+(?>pattern) pattern is matched independently of the + surrounding patterns, the expression will never backtrack into pattern. + Independent sub-expressions are typically used to improve performance; only the + best possible match for pattern will be considered, if this doesn't + allow the expression as a whole to match then no match is found at all.
+Conditional Expressions
+(?(condition)yes-pattern|no-pattern) attempts to match yes-pattern + if the condition is true, otherwise attempts to match no-pattern.
+(?(condition)yes-pattern) attempts to match yes-pattern if + the condition is true, otherwise fails.
+Condition may be either a forward lookahead assert, or the + index of a marked sub-expression (the condition becomes true if the + sub-expression has been matched).
+
The order of precedence of operators is as shown in the following + table:
++
Collation-related bracket symbols | +[==] [::] [..] | +
Escaped characters + | +\ | +
Character set (bracket expression) + | +[] | +
Grouping | +() | +
Single-character-ERE duplication + | +* + ? {m,n} | +
Concatenation | ++ |
Anchoring | +^$ | +
Alternation | +| | +
If you view the regular expression as a directed (possibly cyclic) graph, then + the best match found is the first match found by a depth-first-search performed + on that graph, while matching the input text.
+Alternatively:
+the best match found is the leftmost match, with individual elements matched as + follows;
++
Construct | +What gets matched | +
AtomA AtomB | +Locates the best match for AtomA that has a following match for AtomB. | +
Expression1 | Expression2 | +If Expression1 can be matched then returns that match, otherwise attempts to + match Expression2. | +
S{N} | +Matches S repeated exactly N times. | +
S{N,M} | +Matches S repeated between N and M times, and as many times as possible. | +
S{N,M}? | +Matches S repeated between N and M times, and as few times as possible. | +
S?, S*, S+ | + The same as S{0,1} , S{0,UINT_MAX} ,
+ S{1,UINT_MAX} respectively.
+ |
+
S??, S*?, S+? | +The same as S{0,1}? , S{0,UINT_MAX}? , S{1,UINT_MAX}?
+ respectively.
+ |
+
(?>S) + | +Matches the best match for S, and only that. | +
+ (?=S), (?<=S) + | +Matches only the best match for S (this is only visible if there are capturing + parenthesis within S). | +
(?!S), (?<!S) | +Considers only whether a match for S exists or not. | +
(?(condition)yes-pattern | no-pattern) | +If condition is true, then only yes-pattern is considered, + otherwise only no-pattern is considered. | +
The options normal, ECMAScript, JavaScript + and JScript are all synonyms for Perl.
+There are a variety of flags that + may be combined with the Perl option when constructing the regular + expression, in particular note that the newline_alt + option alters the syntax, while the collate, + nosubs and icase options modify how the case and locale sensitivity + are to be applied.
+The perl smix modifiers can either be applied using a (?smix-smix) + prefix to the regular expression, or with one of the regex-compile time flags + no_mod_m, mod_x, mod_s, and no_mod_s. +
+Revised + + 21 Aug 2004 +
+© Copyright John Maddock 2004
+ +Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt).
+ + + diff --git a/doc/thread_safety.html b/doc/thread_safety.html new file mode 100644 index 00000000..894a7688 --- /dev/null +++ b/doc/thread_safety.html @@ -0,0 +1,70 @@ + + + ++
+ |
+
+ Boost.Regex+Thread Safety+ |
+
+ |
+
The regex library is thread safe when Boost is: you can verify that Boost is in + thread safe mode by checking to see if BOOST_HAS_THREADS is defined: this macro + is set automatically by the config system when threading support is turned on + in your compiler. +
+Class basic_regex<> and its typedefs regex + and wregex are thread safe, in that compiled regular expressions can safely be + shared between threads. The matching algorithms regex_match, + regex_search, regex_grep, + regex_format and regex_merge + are all re-entrant and thread safe. Class match_results + is now thread safe, in that the results of a match can be safely copied from + one thread to another (for example one thread may find matches and push + match_results instances onto a queue, while another thread pops them off the + other end), otherwise use a separate instance of match_results + per thread. +
+The POSIX API functions are all re-entrant and + thread safe, regular expressions compiled with regcomp can also be + shared between threads. +
+The class RegEx is only thread safe if each thread + gets its own RegEx instance (apartment threading) - this is a consequence of + RegEx handling both compiling and matching regular expressions. +
+Finally note that changing the global locale invalidates all compiled regular + expressions, therefore calling set_locale from one thread while another + uses regular expressions will produce unpredictable results. +
++ There is also a requirement that there is only one thread executing prior to + the start of main().
+Revised + + 24 Oct 2003 +
+© Copyright John Maddock 1998- + + 2003
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + diff --git a/doc/uarrow.gif b/doc/uarrow.gif new file mode 100644 index 00000000..6afd20c3 Binary files /dev/null and b/doc/uarrow.gif differ diff --git a/doc/unicode.html b/doc/unicode.html new file mode 100644 index 00000000..9e22faf3 --- /dev/null +++ b/doc/unicode.html @@ -0,0 +1,66 @@ + + + ++
+ |
+
+ Boost.Regex+Unicode Regular Expressions.+ |
+
+ |
+
There are two ways to use Boost.Regex with Unicode strings:
+If your platform's wchar_t type can hold Unicode strings, and your + platform's C/C++ runtime correctly handles wide character constants (when + passed to std::iswspace std::iswlower etc), then you can use boost::wregex to + process Unicode. However, there are several disadvantages to this + approach:
+If you have the ICU + library, then Boost.Regex can be configured + to make use of it, and provide a distinct regular expression type + (boost::u32regex), that supports both Unicode specific character properties, + and the searching of text that is encoded in either UTF-8, UTF-16, or + UTF-32. See: ICU string class support.
++
Revised + + 04 Jan 2005 +
+© Copyright John Maddock 2005
+Use, modification and distribution are subject to the Boost Software License, + Version 1.0. (See accompanying file LICENSE_1_0.txt + or copy at http://www.boost.org/LICENSE_1_0.txt)
+ + +