Updated Unicode compatibility, and finished off the docs.

[SVN r26681]
This commit is contained in:
John Maddock
2005-01-12 12:44:02 +00:00
parent f996efb1e1
commit 16a494ca8f
18 changed files with 707 additions and 28 deletions

View File

@ -24,10 +24,17 @@
</P>
<HR>
<p></p>
<P>
The following are treated as valid digraphs when used as a collating name:</P>
<H3>Contents</H3>
<dl class="index">
<dt><A href="#digraphs">Digraphs</A></dt>
<dt><A href="#posix">POSIX Symbolic Names</A></dt>
<dt><A href="#unicode">Unicode Symbolic Names</A></dt>
</dl>
<H3><A name="digraphs"></A>Digraphs</H3>
<P>The following are treated as valid digraphs when used as a collating name:</P>
<P>"ae", "Ae", "AE", "ch", "Ch", "CH", "ll", "Ll", "LL", "ss", "Ss", "SS", "nj",
"Nj", "NJ", "dz", "Dz", "DZ", "lj", "Lj", "LJ".</P>
<H3><A name="posix"></A>POSIX Symbolic Names</H3>
<P>The following symbolic names are recognised as valid collating element names,
in addition to any single character:</P>
<P>
@ -342,15 +349,18 @@
</TR>
</TABLE>
</P>
<P>
<H3><A name="unicode"></A>Named Unicode Characters</H3>
<P>When using <A href="icu_strings.html">Unicode aware regular expressions</A> (with
the <EM>u32regex </EM>type), all the normal symbolic names for Unicode
characters (those given in UnicodeData.txt) are recognised.</P>
<P>
<HR>
</P>
<P></P>
<p>Revised
<!--webbot bot="Timestamp" S-Type="EDITED" S-Format="%d %B, %Y" startspan -->
24 Oct 2003
<!--webbot bot="Timestamp" endspan i-checksum="39359" --></p>
<p><i>&copy; Copyright John Maddock&nbsp;1998-
<!--webbot bot="Timestamp" S-Type="EDITED" S-Format="%Y" startspan --> 2003<!--webbot bot="Timestamp" endspan i-checksum="39359" --></i></p>
<p>Revised 12 Jan 2005
<!--webbot bot="Timestamp" S-Type="EDITED" S-Format="%d %B, %Y" startspan --></p>
<p><i>&copy; Copyright John Maddock&nbsp;2004-2005</i></p>
<P><I>Use, modification and distribution are subject to the Boost Software License,
Version 1.0. (See accompanying file <A href="../../../LICENSE_1_0.txt">LICENSE_1_0.txt</A>
or copy at <A href="http://www.boost.org/LICENSE_1_0.txt">http://www.boost.org/LICENSE_1_0.txt</A>)</I></P>

View File

@ -40,7 +40,10 @@
<LI>
Added <A href="mfc_strings.html">MFC/ATL string wrappers</A>.
<LI>
Added <A href="unicode.html">Unicode support; based on ICU</A>.</LI></UL>
Added <A href="unicode.html">Unicode support; based on ICU</A>.
<LI>
Changed newline support to recognise \f as a line separator (all character
types), and \x85 as a line separator for wide characters / Unicode only.</LI></UL>
<P>Boost 1.32.1.</P>
<UL>
<LI>

View File

@ -61,6 +61,166 @@
string sort keys produced by the system; if you need this, and the default
implementation doesn't work on your platform, then you will need to supply a
custom traits class.</P>
<H3>Unicode</H3>
<P>The following comments refer to&nbsp;<A href="http://www.unicode.org/reports/tr18/">Unicode
Technical
<SPAN>Standard
</SPAN>#18: Unicode Regular Expressions</A>&nbsp;version 9.</P>
<P>
<TABLE id="Table3" cellSpacing="1" cellPadding="1" width="100%" border="0">
<TR>
<TD>#</TD>
<TD>Feature</TD>
<TD>Support</TD>
</TR>
<TR>
<TD>1.1</TD>
<TD>Hex Notation</TD>
<TD>Yes: use \x{DDDD} to refer to code point U+DDDD.</TD>
</TR>
<TR>
<TD>1.2</TD>
<TD>Character Properties</TD>
<TD>All the names listed under the&nbsp;<A href="http://www.unicode.org/reports/tr18/#Categories">General
Category Property</A> are supported.&nbsp; Script names and Other Names are
not currently supported.</TD>
</TR>
<TR>
<TD>1.3</TD>
<TD><A name="Subtraction_and_Intersection">Subtraction</A> and Intersection</TD>
<TD>
<P>Indirectly supported by forward-lookahead:
</P>
<P>(?=[[:X:]])[[:Y:]]</P>
<P>Gives the intersection of character properties X and Y.</P>
<P>(?![[:X:]])[[:Y:]]</P>
<P>Gives everything in Y that is not in X (subtraction).</P>
</TD>
</TR>
<TR>
<TD>1.4</TD>
<TD><A name="Simple_Word_Boundaries">Simple Word Boundaries</A></TD>
<TD>Conforming: non-spacing marks are included in the set of word characters.</TD>
</TR>
<TR>
<TD>1.5</TD>
<TD>Caseless Matching</TD>
<TD>Supported, note that at this level, case transformations are 1:1, many to many
case folding operations are not supported (for example&nbsp;"&szlig;" to "SS").</TD>
</TR>
<TR>
<TD>1.6</TD>
<TD>Line Boundaries</TD>
<TD>Supported, except that "." matches only one character of "\r\n". Other than
that line boundaries match correctly; including not matching in the middle of a
"\r\n" sequence.</TD>
</TR>
<TR>
<TD>1.7</TD>
<TD>Code Points</TD>
<TD>Supported: provided you use the <A href="icu_strings.html">u32* algorithms</A>,
then UTF-8, UTF-16 and UTF-32 are all treated as sequences of 32-bit code
points.</TD>
</TR>
<TR>
<TD>2.1</TD>
<TD>Canonical Equivalence</TD>
<TD>Not supported: it is up to the user of the library to convert all text into
the same canonical form as the regular expression.</TD>
</TR>
<TR>
<TD>2.2</TD>
<TD>Default Grapheme Clusters</TD>
<TD>Not supported.</TD>
</TR>
<TR>
<TD>2.3</TD>
<TD><!--StartFragment -->
<P><A name="Default_Word_Boundaries">Default Word Boundaries</A></P>
</TD>
<TD>Not supported.</TD>
</TR>
<TR>
<TD>2.4</TD>
<TD><!--StartFragment -->
<P><A name="Default_Loose_Matches">Default Loose Matches</A></P>
</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>2.5</TD>
<TD>Name Properties</TD>
<TD>Supported: the expression "[[:name:]]" or \N{name} matches the named character
"name".</TD>
</TR>
<TR>
<TD>2.6</TD>
<TD>Wildcard properties</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.1</TD>
<TD>Tailored Punctuation.</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.2</TD>
<TD>Tailored Grapheme Clusters</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.3</TD>
<TD>Tailored Word Boundaries.</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.4</TD>
<TD>Tailored Loose Matches</TD>
<TD>Partial support: [[=c=]] matches characters with the same primary equivalence
class as "c".</TD>
</TR>
<TR>
<TD>3.5</TD>
<TD>Tailored Ranges</TD>
<TD>Supported: [a-b] matches any character that collates in the range a to b, when
the expression is constructed with the <A href="syntax_option_type.html">collate</A>
flag set.</TD>
</TR>
<TR>
<TD>3.6</TD>
<TD>Context Matches</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.7</TD>
<TD>Incremental Matches</TD>
<TD>Supported: pass the flag <A href="match_flag_type.html">match_partial</A> to
the regex algorithms.</TD>
</TR>
<TR>
<TD>3.8</TD>
<TD>Unicode Set Sharing</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.9</TD>
<TD>Possible Match Sets</TD>
<TD>Not supported, however this information is used internally to optimise the
matching of regular expressions, and return quickly if no match is possible.</TD>
</TR>
<TR>
<TD>3.10</TD>
<TD>Folded Matching</TD>
<TD>Partial Support:&nbsp; It is possible to achieve a similar effect by using a
custom regular expression traits class.</TD>
</TR>
<TR>
<TD>3.11</TD>
<TD>Custom&nbsp;Submatch Evaluation</TD>
<TD>Not Supported.</TD>
</TR>
</TABLE>
</P>
<HR>
<p>Revised&nbsp;
<!--webbot bot="Timestamp" S-Type="EDITED" S-Format="%d %B, %Y" startspan -->
@ -73,3 +233,4 @@
or copy at <A href="http://www.boost.org/LICENSE_1_0.txt">http://www.boost.org/LICENSE_1_0.txt</A>)</I></P>
</body>
</html>

View File

@ -155,7 +155,7 @@ aaaa</PRE>
<P>matches a NUL character.</P>
<H5>Equivalence classes:</H5>
<P>
An expression of the form[[=col=]], matches any character or collating element
An expression of the form [[=col=]], matches any character or collating element
whose primary sort key is the same as that for collating element <EM>col</EM>,
as with collating elements the name <EM>col</EM> may be a <A href="collating_names.html">
symbolic name</A>.&nbsp; A primary sort key is one that ignores case,
@ -242,6 +242,12 @@ aaaa</PRE>
<TD>An octal escape sequence - matches the single character whose code point is
0ddd.</TD>
</TR>
<TR>
<TD>\N{Name}</TD>
<TD>Matches the single character which has the <A href="collating_names.html">symbolic
name</A> <EM>name.&nbsp; </EM>For example \N{newline} matches the single
character \n.</TD>
</TR>
</TABLE>
</P>
<H5>"Single character" character&nbsp;classes:</H5>
@ -298,6 +304,41 @@ aaaa</PRE>
</TR>
</TABLE>
</P>
<H5>Character Properties</H5>
<P dir="ltr">The character property names in the following table are all
equivalent to the <A href="character_class_names.html">names used in character
classes</A>.</P>
<H5>
<TABLE id="Table9" cellSpacing="1" cellPadding="1" width="100%" border="0">
<TR>
<TD><STRONG>Form</STRONG></TD>
<TD><STRONG>Description</STRONG></TD>
<TD><STRONG>Equivalent character set form</STRONG></TD>
</TR>
<TR>
<TD>\pX</TD>
<TD>Matches any character that has the property X.</TD>
<TD>[[:X:]]</TD>
</TR>
<TR>
<TD>\p{Name}</TD>
<TD>Matches any character that has the property <EM>Name</EM>.</TD>
<TD>[[:Name:]]</TD>
</TR>
<TR>
<TD>\PX</TD>
<TD>Matches any character that does not have the property X.</TD>
<TD>[^[:X:]]</TD>
</TR>
<TR>
<TD>\P{Name}</TD>
<TD>Matches any character that does not have the property <EM>Name</EM>.</TD>
<TD>[^[:Name:]]</TD>
</TR>
</TABLE>
</H5>
<H5>Word Boundaries</H5>
<P>The following escape sequences match the boundaries of words:</P>
<P>

View File

@ -169,7 +169,7 @@ aaaa</PRE>
<P>matches a NUL character.</P>
<H5>Equivalence classes:</H5>
<P>
An expression of the form[[=col=]], matches any character or collating element
An expression of the form [[=col=]], matches any character or collating element
whose primary sort key is the same as that for collating element <EM>col</EM>,
as with collating elements the name <EM>col</EM> may be a <A href="collating_names.html">
symbolic name</A>.&nbsp; A primary sort key is one that ignores case,
@ -250,6 +250,12 @@ aaaa</PRE>
<TD>An octal escape sequence - matches the single character whose code point is
0ddd.</TD>
</TR>
<TR>
<TD>\N{name}</TD>
<TD>Matches the single character which has the <A href="collating_names.html">symbolic
name</A> <EM>name.&nbsp; </EM>For example \N{newline} matches the single
character \n.</TD>
</TR>
</TABLE>
</P>
<H5>"Single character" character&nbsp;classes:</H5>
@ -306,6 +312,38 @@ aaaa</PRE>
</TR>
</TABLE>
</P>
<H5>Character Properties</H5>
<P>The character property names in the following table are all equivalent to the <A href="character_class_names.html">
names used in character classes</A>.</P>
<P>
<TABLE id="Table9" cellSpacing="1" cellPadding="1" width="100%" border="0">
<TR>
<TD><STRONG>Form</STRONG></TD>
<TD><STRONG>Description</STRONG></TD>
<TD><STRONG>Equivalent character set form</STRONG></TD>
</TR>
<TR>
<TD>\pX</TD>
<TD>Matches any character that has the property X.</TD>
<TD>[[:X:]]</TD>
</TR>
<TR>
<TD>\p{Name}</TD>
<TD>Matches any character that has the property <EM>Name</EM>.</TD>
<TD>[[:Name:]]</TD>
</TR>
<TR>
<TD>\PX</TD>
<TD>Matches any character that does not have the property X.</TD>
<TD>[^[:X:]]</TD>
</TR>
<TR>
<TD>\P{Name}</TD>
<TD>Matches any character that does not have the property <EM>Name</EM>.</TD>
<TD>[^[:Name:]]</TD>
</TR>
</TABLE>
</P>
<H5>Word Boundaries</H5>
<P>The following escape sequences match the boundaries of words:</P>
<P>

View File

@ -24,10 +24,17 @@
</P>
<HR>
<p></p>
<P>
The following are treated as valid digraphs when used as a collating name:</P>
<H3>Contents</H3>
<dl class="index">
<dt><A href="#digraphs">Digraphs</A></dt>
<dt><A href="#posix">POSIX Symbolic Names</A></dt>
<dt><A href="#unicode">Unicode Symbolic Names</A></dt>
</dl>
<H3><A name="digraphs"></A>Digraphs</H3>
<P>The following are treated as valid digraphs when used as a collating name:</P>
<P>"ae", "Ae", "AE", "ch", "Ch", "CH", "ll", "Ll", "LL", "ss", "Ss", "SS", "nj",
"Nj", "NJ", "dz", "Dz", "DZ", "lj", "Lj", "LJ".</P>
<H3><A name="posix"></A>POSIX Symbolic Names</H3>
<P>The following symbolic names are recognised as valid collating element names,
in addition to any single character:</P>
<P>
@ -342,15 +349,18 @@
</TR>
</TABLE>
</P>
<P>
<H3><A name="unicode"></A>Named Unicode Characters</H3>
<P>When using <A href="icu_strings.html">Unicode aware regular expressions</A> (with
the <EM>u32regex </EM>type), all the normal symbolic names for Unicode
characters (those given in UnicodeData.txt) are recognised.</P>
<P>
<HR>
</P>
<P></P>
<p>Revised
<!--webbot bot="Timestamp" S-Type="EDITED" S-Format="%d %B, %Y" startspan -->
24 Oct 2003
<!--webbot bot="Timestamp" endspan i-checksum="39359" --></p>
<p><i>&copy; Copyright John Maddock&nbsp;1998-
<!--webbot bot="Timestamp" S-Type="EDITED" S-Format="%Y" startspan --> 2003<!--webbot bot="Timestamp" endspan i-checksum="39359" --></i></p>
<p>Revised 12 Jan 2005
<!--webbot bot="Timestamp" S-Type="EDITED" S-Format="%d %B, %Y" startspan --></p>
<p><i>&copy; Copyright John Maddock&nbsp;2004-2005</i></p>
<P><I>Use, modification and distribution are subject to the Boost Software License,
Version 1.0. (See accompanying file <A href="../../../LICENSE_1_0.txt">LICENSE_1_0.txt</A>
or copy at <A href="http://www.boost.org/LICENSE_1_0.txt">http://www.boost.org/LICENSE_1_0.txt</A>)</I></P>

View File

@ -40,7 +40,10 @@
<LI>
Added <A href="mfc_strings.html">MFC/ATL string wrappers</A>.
<LI>
Added <A href="unicode.html">Unicode support; based on ICU</A>.</LI></UL>
Added <A href="unicode.html">Unicode support; based on ICU</A>.
<LI>
Changed newline support to recognise \f as a line separator (all character
types), and \x85 as a line separator for wide characters / Unicode only.</LI></UL>
<P>Boost 1.32.1.</P>
<UL>
<LI>

View File

@ -61,6 +61,166 @@
string sort keys produced by the system; if you need this, and the default
implementation doesn't work on your platform, then you will need to supply a
custom traits class.</P>
<H3>Unicode</H3>
<P>The following comments refer to&nbsp;<A href="http://www.unicode.org/reports/tr18/">Unicode
Technical
<SPAN>Standard
</SPAN>#18: Unicode Regular Expressions</A>&nbsp;version 9.</P>
<P>
<TABLE id="Table3" cellSpacing="1" cellPadding="1" width="100%" border="0">
<TR>
<TD>#</TD>
<TD>Feature</TD>
<TD>Support</TD>
</TR>
<TR>
<TD>1.1</TD>
<TD>Hex Notation</TD>
<TD>Yes: use \x{DDDD} to refer to code point U+DDDD.</TD>
</TR>
<TR>
<TD>1.2</TD>
<TD>Character Properties</TD>
<TD>All the names listed under the&nbsp;<A href="http://www.unicode.org/reports/tr18/#Categories">General
Category Property</A> are supported.&nbsp; Script names and Other Names are
not currently supported.</TD>
</TR>
<TR>
<TD>1.3</TD>
<TD><A name="Subtraction_and_Intersection">Subtraction</A> and Intersection</TD>
<TD>
<P>Indirectly supported by forward-lookahead:
</P>
<P>(?=[[:X:]])[[:Y:]]</P>
<P>Gives the intersection of character properties X and Y.</P>
<P>(?![[:X:]])[[:Y:]]</P>
<P>Gives everything in Y that is not in X (subtraction).</P>
</TD>
</TR>
<TR>
<TD>1.4</TD>
<TD><A name="Simple_Word_Boundaries">Simple Word Boundaries</A></TD>
<TD>Conforming: non-spacing marks are included in the set of word characters.</TD>
</TR>
<TR>
<TD>1.5</TD>
<TD>Caseless Matching</TD>
<TD>Supported, note that at this level, case transformations are 1:1, many to many
case folding operations are not supported (for example&nbsp;"&szlig;" to "SS").</TD>
</TR>
<TR>
<TD>1.6</TD>
<TD>Line Boundaries</TD>
<TD>Supported, except that "." matches only one character of "\r\n". Other than
that line boundaries match correctly; including not matching in the middle of a
"\r\n" sequence.</TD>
</TR>
<TR>
<TD>1.7</TD>
<TD>Code Points</TD>
<TD>Supported: provided you use the <A href="icu_strings.html">u32* algorithms</A>,
then UTF-8, UTF-16 and UTF-32 are all treated as sequences of 32-bit code
points.</TD>
</TR>
<TR>
<TD>2.1</TD>
<TD>Canonical Equivalence</TD>
<TD>Not supported: it is up to the user of the library to convert all text into
the same canonical form as the regular expression.</TD>
</TR>
<TR>
<TD>2.2</TD>
<TD>Default Grapheme Clusters</TD>
<TD>Not supported.</TD>
</TR>
<TR>
<TD>2.3</TD>
<TD><!--StartFragment -->
<P><A name="Default_Word_Boundaries">Default Word Boundaries</A></P>
</TD>
<TD>Not supported.</TD>
</TR>
<TR>
<TD>2.4</TD>
<TD><!--StartFragment -->
<P><A name="Default_Loose_Matches">Default Loose Matches</A></P>
</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>2.5</TD>
<TD>Name Properties</TD>
<TD>Supported: the expression "[[:name:]]" or \N{name} matches the named character
"name".</TD>
</TR>
<TR>
<TD>2.6</TD>
<TD>Wildcard properties</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.1</TD>
<TD>Tailored Punctuation.</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.2</TD>
<TD>Tailored Grapheme Clusters</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.3</TD>
<TD>Tailored Word Boundaries.</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.4</TD>
<TD>Tailored Loose Matches</TD>
<TD>Partial support: [[=c=]] matches characters with the same primary equivalence
class as "c".</TD>
</TR>
<TR>
<TD>3.5</TD>
<TD>Tailored Ranges</TD>
<TD>Supported: [a-b] matches any character that collates in the range a to b, when
the expression is constructed with the <A href="syntax_option_type.html">collate</A>
flag set.</TD>
</TR>
<TR>
<TD>3.6</TD>
<TD>Context Matches</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.7</TD>
<TD>Incremental Matches</TD>
<TD>Supported: pass the flag <A href="match_flag_type.html">match_partial</A> to
the regex algorithms.</TD>
</TR>
<TR>
<TD>3.8</TD>
<TD>Unicode Set Sharing</TD>
<TD>Not Supported.</TD>
</TR>
<TR>
<TD>3.9</TD>
<TD>Possible Match Sets</TD>
<TD>Not supported, however this information is used internally to optimise the
matching of regular expressions, and return quickly if no match is possible.</TD>
</TR>
<TR>
<TD>3.10</TD>
<TD>Folded Matching</TD>
<TD>Partial Support:&nbsp; It is possible to achieve a similar effect by using a
custom regular expression traits class.</TD>
</TR>
<TR>
<TD>3.11</TD>
<TD>Custom&nbsp;Submatch Evaluation</TD>
<TD>Not Supported.</TD>
</TR>
</TABLE>
</P>
<HR>
<p>Revised&nbsp;
<!--webbot bot="Timestamp" S-Type="EDITED" S-Format="%d %B, %Y" startspan -->
@ -73,3 +233,4 @@
or copy at <A href="http://www.boost.org/LICENSE_1_0.txt">http://www.boost.org/LICENSE_1_0.txt</A>)</I></P>
</body>
</html>

View File

@ -155,7 +155,7 @@ aaaa</PRE>
<P>matches a NUL character.</P>
<H5>Equivalence classes:</H5>
<P>
An expression of the form[[=col=]], matches any character or collating element
An expression of the form [[=col=]], matches any character or collating element
whose primary sort key is the same as that for collating element <EM>col</EM>,
as with collating elements the name <EM>col</EM> may be a <A href="collating_names.html">
symbolic name</A>.&nbsp; A primary sort key is one that ignores case,
@ -242,6 +242,12 @@ aaaa</PRE>
<TD>An octal escape sequence - matches the single character whose code point is
0ddd.</TD>
</TR>
<TR>
<TD>\N{Name}</TD>
<TD>Matches the single character which has the <A href="collating_names.html">symbolic
name</A> <EM>name.&nbsp; </EM>For example \N{newline} matches the single
character \n.</TD>
</TR>
</TABLE>
</P>
<H5>"Single character" character&nbsp;classes:</H5>
@ -298,6 +304,41 @@ aaaa</PRE>
</TR>
</TABLE>
</P>
<H5>Character Properties</H5>
<P dir="ltr">The character property names in the following table are all
equivalent to the <A href="character_class_names.html">names used in character
classes</A>.</P>
<H5>
<TABLE id="Table9" cellSpacing="1" cellPadding="1" width="100%" border="0">
<TR>
<TD><STRONG>Form</STRONG></TD>
<TD><STRONG>Description</STRONG></TD>
<TD><STRONG>Equivalent character set form</STRONG></TD>
</TR>
<TR>
<TD>\pX</TD>
<TD>Matches any character that has the property X.</TD>
<TD>[[:X:]]</TD>
</TR>
<TR>
<TD>\p{Name}</TD>
<TD>Matches any character that has the property <EM>Name</EM>.</TD>
<TD>[[:Name:]]</TD>
</TR>
<TR>
<TD>\PX</TD>
<TD>Matches any character that does not have the property X.</TD>
<TD>[^[:X:]]</TD>
</TR>
<TR>
<TD>\P{Name}</TD>
<TD>Matches any character that does not have the property <EM>Name</EM>.</TD>
<TD>[^[:Name:]]</TD>
</TR>
</TABLE>
</H5>
<H5>Word Boundaries</H5>
<P>The following escape sequences match the boundaries of words:</P>
<P>

View File

@ -169,7 +169,7 @@ aaaa</PRE>
<P>matches a NUL character.</P>
<H5>Equivalence classes:</H5>
<P>
An expression of the form[[=col=]], matches any character or collating element
An expression of the form [[=col=]], matches any character or collating element
whose primary sort key is the same as that for collating element <EM>col</EM>,
as with collating elements the name <EM>col</EM> may be a <A href="collating_names.html">
symbolic name</A>.&nbsp; A primary sort key is one that ignores case,
@ -250,6 +250,12 @@ aaaa</PRE>
<TD>An octal escape sequence - matches the single character whose code point is
0ddd.</TD>
</TR>
<TR>
<TD>\N{name}</TD>
<TD>Matches the single character which has the <A href="collating_names.html">symbolic
name</A> <EM>name.&nbsp; </EM>For example \N{newline} matches the single
character \n.</TD>
</TR>
</TABLE>
</P>
<H5>"Single character" character&nbsp;classes:</H5>
@ -306,6 +312,38 @@ aaaa</PRE>
</TR>
</TABLE>
</P>
<H5>Character Properties</H5>
<P>The character property names in the following table are all equivalent to the <A href="character_class_names.html">
names used in character classes</A>.</P>
<P>
<TABLE id="Table9" cellSpacing="1" cellPadding="1" width="100%" border="0">
<TR>
<TD><STRONG>Form</STRONG></TD>
<TD><STRONG>Description</STRONG></TD>
<TD><STRONG>Equivalent character set form</STRONG></TD>
</TR>
<TR>
<TD>\pX</TD>
<TD>Matches any character that has the property X.</TD>
<TD>[[:X:]]</TD>
</TR>
<TR>
<TD>\p{Name}</TD>
<TD>Matches any character that has the property <EM>Name</EM>.</TD>
<TD>[[:Name:]]</TD>
</TR>
<TR>
<TD>\PX</TD>
<TD>Matches any character that does not have the property X.</TD>
<TD>[^[:X:]]</TD>
</TR>
<TR>
<TD>\P{Name}</TD>
<TD>Matches any character that does not have the property <EM>Name</EM>.</TD>
<TD>[^[:Name:]]</TD>
</TR>
</TABLE>
</P>
<H5>Word Boundaries</H5>
<P>The following escape sequences match the boundaries of words:</P>
<P>

View File

@ -842,6 +842,8 @@ void basic_regex_creator<charT, traits>::create_startmap(re_syntax_base* state,
l_map[0] |= mask_init;
l_map['\n'] |= mask;
l_map['\r'] |= mask;
l_map['\f'] |= mask;
l_map[0x85] |= mask;
}
// now figure out if we can match a NULL string at this point:
if(pnull)

View File

@ -2012,7 +2012,7 @@ bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_st
m_alt_jumps.pop_back();
this->m_pdata->m_data.align();
re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
BOOST_ASSERT(jmp->type = syntax_element_jump);
BOOST_ASSERT(jmp->type == syntax_element_jump);
jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
}
return true;

View File

@ -105,12 +105,18 @@ inline bool is_combining<wchar_t>(wchar_t c)
template <class charT>
inline bool is_separator(charT c)
{
return BOOST_REGEX_MAKE_BOOL((c == static_cast<charT>('\n')) || (c == static_cast<charT>('\r')) || (static_cast<int>(c) == 0x2028) || (static_cast<int>(c) == 0x2029));
return BOOST_REGEX_MAKE_BOOL(
(c == static_cast<charT>('\n'))
|| (c == static_cast<charT>('\r'))
|| (c == static_cast<charT>('\f'))
|| (static_cast<boost::uint16_t>(c) == 0x2028u)
|| (static_cast<boost::uint16_t>(c) == 0x2029u)
|| (static_cast<boost::uint16_t>(c) == 0x85u));
}
template <>
inline bool is_separator<char>(char c)
{
return BOOST_REGEX_MAKE_BOOL((c == '\n') || (c == '\r'));
return BOOST_REGEX_MAKE_BOOL((c == '\n') || (c == '\r') || (c == '\f'));
}
//

View File

@ -347,8 +347,8 @@ icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_
U_GC_LU_MASK,
mask_unicode,
U_GC_LU_MASK,
char_class_type(U_GC_L_MASK | U_GC_ND_MASK) | mask_underscore,
char_class_type(U_GC_L_MASK | U_GC_ND_MASK) | mask_underscore,
char_class_type(U_GC_L_MASK | U_GC_ND_MASK | U_GC_MN_MASK) | mask_underscore,
char_class_type(U_GC_L_MASK | U_GC_ND_MASK | U_GC_MN_MASK) | mask_underscore,
char_class_type(U_GC_ND_MASK) | mask_xdigit,
};

View File

@ -86,6 +86,64 @@ void compare_result(const MR1& w1, const MR2& w2, boost::mpl::int_<1> const*)
}
}
//
// test_icu_grep:
// Walks every match of r inside search_text with a u32regex_iterator and
// verifies, for each one, the match itself plus its prefix ($`) and
// suffix ($') against the answer table of the current wchar_t test.
// Any discrepancy is reported through BOOST_REGEX_TEST_ERROR.
//
void test_icu_grep(const boost::u32regex& r, const std::vector< ::UChar32>& search_text)
{
   typedef std::vector< ::UChar32>::const_iterator text_iterator;
   typedef boost::u32regex_iterator<text_iterator> grep_iterator;

   boost::regex_constants::match_flag_type flags = test_info<wchar_t>::match_options();
   // expected results: groups of offsets, each group terminated by -2
   const int* expected = test_info<wchar_t>::answer_table();

   grep_iterator cur(search_text.begin(), search_text.end(), r, flags);
   grep_iterator finish;
   // twin is advanced in lock-step with cur, so both must always compare equal:
   grep_iterator twin(cur);
   // where the previous match stopped (start of text before the first match):
   text_iterator previous_end = search_text.begin();

   for(; cur != finish; ++cur, ++twin)
   {
      if(cur != twin)
      {
         BOOST_REGEX_TEST_ERROR("Failed iterator != comparison.", wchar_t);
      }
      if(!(cur == twin))
      {
         BOOST_REGEX_TEST_ERROR("Failed iterator == comparison.", wchar_t);
      }
      test_result(*cur, search_text.begin(), expected);

      const boost::sub_match<text_iterator>& whole = (*cur)[0];
      const boost::sub_match<text_iterator>& before = cur->prefix();
      const boost::sub_match<text_iterator>& after = cur->suffix();

      // $` must span from the end of the previous match to the start of this one:
      if(before.first != previous_end)
      {
         BOOST_REGEX_TEST_ERROR("Incorrect position for start of $`", wchar_t);
      }
      if(before.second != whole.first)
      {
         BOOST_REGEX_TEST_ERROR("Incorrect position for end of $`", wchar_t);
      }
      if(before.matched != (before.first != before.second))
      {
         BOOST_REGEX_TEST_ERROR("Incorrect position for matched member of $`", wchar_t);
      }

      // $' must span from the end of this match to the end of the text:
      if(after.first != whole.second)
      {
         BOOST_REGEX_TEST_ERROR("Incorrect position for start of $'", wchar_t);
      }
      if(after.second != search_text.end())
      {
         BOOST_REGEX_TEST_ERROR("Incorrect position for end of $'", wchar_t);
      }
      if(after.matched != (after.first != after.second))
      {
         BOOST_REGEX_TEST_ERROR("Incorrect position for matched member of $'", wchar_t);
      }

      previous_end = whole.second;

      // step the answer table past the -2 that terminates the current group,
      // unless we are already sitting on a terminator:
      if(*expected != -2)
      {
         while(*expected != -2)
            ++expected;
         ++expected;
      }
   }

   if(expected[0] >= 0)
   {
      // the table still lists a match that the iterator never produced:
      BOOST_REGEX_TEST_ERROR("Expected match was not found.", wchar_t);
   }
}
void test_icu(const wchar_t&, const test_regex_search_tag& )
{
@ -204,6 +262,10 @@ void test_icu(const wchar_t&, const test_regex_search_tag& )
}
}
}
//
// finally try a grep:
//
test_icu_grep(r, search_text);
}
catch(const boost::bad_expression& e)
{

View File

@ -305,5 +305,65 @@ void test_sets2()
TEST_INVALID_REGEX("\\N{}", perl);
TEST_INVALID_REGEX("\\N{invalid-name}", perl);
TEST_INVALID_REGEX("\\N{zero", perl);
// and repeat with POSIX-extended syntax:
TEST_REGEX_SEARCH("\\pl+", extended, "ABabcAB", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\Pl+", extended, "abABCab", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\pu+", extended, "abABCab", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\Pu+", extended, "ABabcAB", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\pd+", extended, "AB012AB", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\PD+", extended, "01abc01", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\ps+", extended, "AB AB", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\PS+", extended, " abc ", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\p{alnum}+", extended, "-%@a0X_-", match_default, make_array(3, 6, -2, -2));
TEST_REGEX_SEARCH("\\p{alpha}+", extended, " -%@aX_0-", match_default, make_array(4, 6, -2, -2));
TEST_REGEX_SEARCH("\\p{blank}+", extended, "a \tb", match_default, make_array(1, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{cntrl}+", extended, " a\n\tb", match_default, make_array(2, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{digit}+", extended, "a019b", match_default, make_array(1, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{graph}+", extended, " a%b ", match_default, make_array(1, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{lower}+", extended, "AabC", match_default, make_array(1, 3, -2, -2));
TEST_REGEX_SEARCH("\\p{print}+", extended, "AabC", match_default, make_array(0, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{punct}+", extended, " %-&\t", match_default, make_array(1, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{space}+", extended, "a \n\t\rb", match_default, make_array(1, 5, -2, -2));
TEST_REGEX_SEARCH("\\p{upper}+", extended, "aBCd", match_default, make_array(1, 3, -2, -2));
TEST_REGEX_SEARCH("\\p{xdigit}+", extended, "p0f3Cx", match_default, make_array(1, 5, -2, -2));
TEST_REGEX_SEARCH("\\P{alnum}+", extended, "-%@a", match_default, make_array(0, 3, -2, -2));
TEST_REGEX_SEARCH("\\P{alpha}+", extended, " -%@a", match_default, make_array(0, 4, -2, -2));
TEST_REGEX_SEARCH("\\P{blank}+", extended, "a ", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{cntrl}+", extended, " a\n", match_default, make_array(0, 2, -2, -2));
TEST_REGEX_SEARCH("\\P{digit}+", extended, "a0", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{graph}+", extended, " a", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{lower}+", extended, "Aa", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{print}+", extended, "Absc", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("\\P{punct}+", extended, " %", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{space}+", extended, "a ", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{upper}+", extended, "aB", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{xdigit}+", extended, "pf", match_default, make_array(0, 1, -2, -2));
TEST_INVALID_REGEX("\\p{invalid class}", extended);
TEST_INVALID_REGEX("\\p{upper", extended);
TEST_INVALID_REGEX("\\p{", extended);
TEST_INVALID_REGEX("\\p", extended);
TEST_INVALID_REGEX("\\P{invalid class}", extended);
TEST_INVALID_REGEX("\\P{upper", extended);
TEST_INVALID_REGEX("\\P{", extended);
TEST_INVALID_REGEX("\\P", extended);
// try named characters:
TEST_REGEX_SEARCH("\\N{zero}", extended, "0", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\N{one}", extended, "1", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\N{two}", extended, "2", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\N{three}", extended, "3", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\N{a}", extended, "bac", match_default, make_array(1, 2, -2, -2));
TEST_REGEX_SEARCH("\\N{\xf0}", extended, "b\xf0x", match_default, make_array(1, 2, -2, -2));
TEST_REGEX_SEARCH("\\N{right-curly-bracket}", extended, "}", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\N{NUL}", extended, "\0", match_default, make_array(0, 1, -2, -2));
TEST_INVALID_REGEX("\\N", extended);
TEST_INVALID_REGEX("\\N{", extended);
TEST_INVALID_REGEX("\\N{}", extended);
TEST_INVALID_REGEX("\\N{invalid-name}", extended);
TEST_INVALID_REGEX("\\N{zero", extended);
}

View File

@ -147,6 +147,14 @@ void test_unicode()
TEST_REGEX_SEARCH_U(L"[\\N{MODIFIER LETTER LOW ACUTE ACCENT}]", perl, L"\x02CF", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH_U(L"[\\N{SUPERSCRIPT ONE}]", perl, L"\x00B9", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH_U(L"\\N{CJK UNIFIED IDEOGRAPH-7FED}", perl, L"\x7FED", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH_U(L"\\w+", perl, L" e\x301" L"coute ", match_default, make_array(1, 8, -2, -2));
TEST_REGEX_SEARCH_U(L"^", perl, L" \x2028 \x2029 \x000D\x000A \x000A \x000C \x000D \x0085 ",
match_default | match_not_bol, make_array(2, 2, -2, 4, 4, -2, 7, 7, -2, 9, 9, -2, 11, 11, -2, 13, 13, -2, 15, 15, -2, -2));
TEST_REGEX_SEARCH_U(L"$", perl, L" \x2028 \x2029 \x000D\x000A \x000A \x000C \x000D \x0085 ",
match_default | match_not_eol, make_array(1, 1, -2, 3, 3, -2, 5, 5, -2, 8, 8, -2, 10, 10, -2, 12, 12, -2, 14, 14, -2, -2));
TEST_REGEX_SEARCH_U(L".", perl, L" \x2028\x2029\x000D\x000A\x000A\x000C\x000D\x0085 ",
match_default | match_not_dot_newline, make_array(0, 1, -2, 9, 10, -2, -2));
}
#else

View File

@ -18,6 +18,7 @@
*/
#include <map>
#include <vector>
#include <string>
#include <iostream>
#include <algorithm>
@ -27,9 +28,16 @@
std::string g_char_type;
std::string g_data_type;
std::map<std::string, std::string> g_table;
std::map<std::string, std::pair<std::string, std::string> > g_help_table;
void add(std::string key, std::string data)
{
g_table[key] = data;
if(key.size() <= 2)
g_help_table[data].first = key;
else
g_help_table[data].second = key;
std::string::size_type i = 0;
while(i < key.size())
{
@ -41,7 +49,6 @@ void add(std::string key, std::string data)
++i;
}
}
g_table[key] = data;
}
#define ADD(x, y) add(BOOST_STRINGIZE(x), BOOST_STRINGIZE(y))
@ -88,6 +95,33 @@ void generate_code()
g_table.clear();
}
//
// generate_html:
// Writes an HTML table to std::cout with one row per entry of g_help_table,
// showing the (short name, long name) pair stored for that entry.  Rows are
// ordered by sorting the pairs themselves.
//
void generate_html()
{
   typedef std::pair<std::string, std::string> name_pair;
   typedef std::map<std::string, name_pair> help_map;
   typedef std::vector<name_pair> row_list;

   // collect the name pairs, then sort them:
   row_list rows;
   for(help_map::const_iterator pos = g_help_table.begin(); pos != g_help_table.end(); ++pos)
   {
      rows.push_back(pos->second);
   }
   std::sort(rows.begin(), rows.end());

   std::cout << "<table width=\"100%\"><tr><td><b>Short Name</b></td><td><b>Long Name</b></td></tr>\n";
   for(row_list::const_iterator row = rows.begin(); row != rows.end(); ++row)
   {
      // an entry with no short name gets a placeholder cell:
      const std::string short_name = row->first.empty() ? std::string(" ") : row->first;
      std::cout << "<tr><td>" << short_name << "</td><td>" << row->second << "</td></tr>\n";
   }
   std::cout << "</table>\n\n";
}
int main()
{
g_char_type = "::UChar32";
@ -178,5 +212,6 @@ int main()
ADD(Titlecase, U_GC_LT_MASK);
generate_code();
generate_html();
return 0;
}