Added possessive modifiers ++ *+ ?+ {}+.

Added support for \v and \h (vertical and horizontal whitespace) as character classes, as per Perl 5.10.

[SVN r52558]
This commit is contained in:
John Maddock
2009-04-23 09:51:31 +00:00
parent ccf465daac
commit 7b10b5dac5
96 changed files with 521 additions and 286 deletions

View File

@ -184,7 +184,9 @@ private:
offset_underscore = U_CHAR_CATEGORY_COUNT+3,
offset_unicode = U_CHAR_CATEGORY_COUNT+4,
offset_any = U_CHAR_CATEGORY_COUNT+5,
offset_ascii = U_CHAR_CATEGORY_COUNT+6
offset_ascii = U_CHAR_CATEGORY_COUNT+6,
offset_horizontal = U_CHAR_CATEGORY_COUNT+7,
offset_vertical = U_CHAR_CATEGORY_COUNT+8
};
//
@ -197,6 +199,8 @@ private:
static const char_class_type mask_unicode;
static const char_class_type mask_any;
static const char_class_type mask_ascii;
static const char_class_type mask_horizontal;
static const char_class_type mask_vertical;
static char_class_type lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2);

View File

@ -610,6 +610,7 @@ bool basic_regex_parser<charT, traits>::parse_extended_escape()
// fall through:
case regex_constants::escape_type_class:
{
escape_type_class_jump:
typedef typename traits::char_class_type mask_type;
mask_type m = this->m_traits.lookup_classname(m_position, m_position+1);
if(m != 0)
@ -720,6 +721,10 @@ bool basic_regex_parser<charT, traits>::parse_extended_escape()
}
fail(regex_constants::error_ctype, m_position - m_base);
}
case regex_constants::escape_type_control_v:
if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
goto escape_type_class_jump;
// fallthrough:
default:
this->append_literal(unescape_character());
break;
@ -747,6 +752,7 @@ template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
{
bool greedy = true;
bool pocessive = false;
std::size_t insert_point;
//
// when we get to here we may have a non-greedy ? mark still to come:
@ -758,12 +764,19 @@ bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_
)
)
{
// OK we have a perl regex, check for a '?':
// OK we have a perl or emacs regex, check for a '?':
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
{
greedy = false;
++m_position;
}
// for perl regexes only: check for a possessive repeat (a trailing '+', as in "++").
if((0 == (this->flags() & regbase::main_option_type))
&& (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_plus))
{
pocessive = true;
++m_position;
}
}
if(0 == this->m_last_state)
{
@ -832,6 +845,20 @@ bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_
// now fill in the alt jump for the repeat:
rep = static_cast<re_repeat*>(this->getaddress(rep_off));
rep->alt.i = this->m_pdata->m_data.size() - rep_off;
//
// If the repeat is possessive then bracket the repeat with a (?>...)
// independent sub-expression construct:
//
if(pocessive)
{
re_brace* pb = static_cast<re_brace*>(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace)));
pb->index = -3;
re_jump* jmp = static_cast<re_jump*>(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump)));
this->m_pdata->m_data.align();
jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
pb->index = -3;
}
return true;
}

View File

@ -394,7 +394,9 @@ enum
char_class_graph=char_class_alnum|char_class_punct,
char_class_blank=1<<9,
char_class_word=1<<10,
char_class_unicode=1<<11
char_class_unicode=1<<11,
char_class_horizontal_space=1<<12,
char_class_vertical_space=1<<13
};
#endif
@ -413,6 +415,8 @@ public:
BOOST_STATIC_CONSTANT(char_class_type, mask_blank = 1u << 24);
BOOST_STATIC_CONSTANT(char_class_type, mask_word = 1u << 25);
BOOST_STATIC_CONSTANT(char_class_type, mask_unicode = 1u << 26);
BOOST_STATIC_CONSTANT(char_class_type, mask_horizontal = 1u << 27);
BOOST_STATIC_CONSTANT(char_class_type, mask_vertical = 1u << 28);
#endif
typedef std::basic_string<charT> string_type;
@ -477,6 +481,10 @@ template <class charT>
typename cpp_regex_traits_implementation<charT>::char_class_type const cpp_regex_traits_implementation<charT>::mask_word;
template <class charT>
typename cpp_regex_traits_implementation<charT>::char_class_type const cpp_regex_traits_implementation<charT>::mask_unicode;
template <class charT>
typename cpp_regex_traits_implementation<charT>::char_class_type const cpp_regex_traits_implementation<charT>::mask_vertical;
template <class charT>
typename cpp_regex_traits_implementation<charT>::char_class_type const cpp_regex_traits_implementation<charT>::mask_horizontal;
#endif
#endif
@ -688,18 +696,20 @@ void cpp_regex_traits_implementation<charT>::init()
// Custom class names:
//
#ifndef BOOST_REGEX_BUGGY_CTYPE_FACET
static const char_class_type masks[14] =
static const char_class_type masks[16] =
{
std::ctype<charT>::alnum,
std::ctype<charT>::alpha,
std::ctype<charT>::cntrl,
std::ctype<charT>::digit,
std::ctype<charT>::graph,
cpp_regex_traits_implementation<charT>::mask_horizontal,
std::ctype<charT>::lower,
std::ctype<charT>::print,
std::ctype<charT>::punct,
std::ctype<charT>::space,
std::ctype<charT>::upper,
cpp_regex_traits_implementation<charT>::mask_vertical,
std::ctype<charT>::xdigit,
cpp_regex_traits_implementation<charT>::mask_blank,
cpp_regex_traits_implementation<charT>::mask_word,
@ -713,11 +723,13 @@ void cpp_regex_traits_implementation<charT>::init()
::boost::re_detail::char_class_cntrl,
::boost::re_detail::char_class_digit,
::boost::re_detail::char_class_graph,
::boost::re_detail::char_class_horizontal_space,
::boost::re_detail::char_class_lower,
::boost::re_detail::char_class_print,
::boost::re_detail::char_class_punct,
::boost::re_detail::char_class_space,
::boost::re_detail::char_class_upper,
::boost::re_detail::char_class_vertical_space,
::boost::re_detail::char_class_xdigit,
::boost::re_detail::char_class_blank,
::boost::re_detail::char_class_word,
@ -744,7 +756,7 @@ typename cpp_regex_traits_implementation<charT>::char_class_type
cpp_regex_traits_implementation<charT>::lookup_classname_imp(const charT* p1, const charT* p2) const
{
#ifndef BOOST_REGEX_BUGGY_CTYPE_FACET
static const char_class_type masks[20] =
static const char_class_type masks[22] =
{
0,
std::ctype<char>::alnum,
@ -754,6 +766,7 @@ typename cpp_regex_traits_implementation<charT>::char_class_type
std::ctype<char>::digit,
std::ctype<char>::digit,
std::ctype<char>::graph,
cpp_regex_traits_implementation<charT>::mask_horizontal,
std::ctype<char>::lower,
std::ctype<char>::lower,
std::ctype<char>::print,
@ -763,12 +776,13 @@ typename cpp_regex_traits_implementation<charT>::char_class_type
std::ctype<char>::upper,
cpp_regex_traits_implementation<charT>::mask_unicode,
std::ctype<char>::upper,
cpp_regex_traits_implementation<charT>::mask_vertical,
std::ctype<char>::alnum | cpp_regex_traits_implementation<charT>::mask_word,
std::ctype<char>::alnum | cpp_regex_traits_implementation<charT>::mask_word,
std::ctype<char>::xdigit,
};
#else
static const char_class_type masks[20] =
static const char_class_type masks[22] =
{
0,
::boost::re_detail::char_class_alnum,
@ -778,6 +792,7 @@ typename cpp_regex_traits_implementation<charT>::char_class_type
::boost::re_detail::char_class_digit,
::boost::re_detail::char_class_digit,
::boost::re_detail::char_class_graph,
::boost::re_detail::char_class_horizontal_space,
::boost::re_detail::char_class_lower,
::boost::re_detail::char_class_lower,
::boost::re_detail::char_class_print,
@ -787,6 +802,7 @@ typename cpp_regex_traits_implementation<charT>::char_class_type
::boost::re_detail::char_class_upper,
::boost::re_detail::char_class_unicode,
::boost::re_detail::char_class_upper,
::boost::re_detail::char_class_vertical_space,
::boost::re_detail::char_class_alnum | ::boost::re_detail::char_class_word,
::boost::re_detail::char_class_alnum | ::boost::re_detail::char_class_word,
::boost::re_detail::char_class_xdigit,
@ -820,7 +836,9 @@ bool cpp_regex_traits_implementation<charT>::isctype(const charT c, char_class_t
|| ((mask & ::boost::re_detail::char_class_xdigit) && (m_pctype->is(std::ctype<charT>::xdigit, c)))
|| ((mask & ::boost::re_detail::char_class_blank) && (m_pctype->is(std::ctype<charT>::space, c)) && !::boost::re_detail::is_separator(c))
|| ((mask & ::boost::re_detail::char_class_word) && (c == '_'))
|| ((mask & ::boost::re_detail::char_class_unicode) && ::boost::re_detail::is_extended(c));
|| ((mask & ::boost::re_detail::char_class_unicode) && ::boost::re_detail::is_extended(c))
|| ((mask & ::boost::re_detail::char_class_vertical) && (is_separator(c) || (c == '\v')))
|| ((mask & ::boost::re_detail::char_class_horizontal) && m_pctype->is(std::ctype<charT>::space, c) && !(is_separator(c) || (c == '\v')));
}
#endif
@ -930,6 +948,12 @@ public:
&& m_pimpl->m_pctype->is(std::ctype<charT>::space, c)
&& !re_detail::is_separator(c))
return true;
else if((f & re_detail::cpp_regex_traits_implementation<charT>::mask_vertical)
&& (::boost::re_detail::is_separator(c) || (c == '\v')))
return true;
else if((f & re_detail::cpp_regex_traits_implementation<charT>::mask_horizontal)
&& this->isctype(c, std::ctype<charT>::space) && !this->isctype(c, re_detail::cpp_regex_traits_implementation<charT>::mask_vertical))
return true;
return false;
#else
return m_pimpl->isctype(c, f);

View File

@ -159,7 +159,7 @@ struct character_pointer_range
template <class charT>
int get_default_class_id(const charT* p1, const charT* p2)
{
static const charT data[72] = {
static const charT data[73] = {
'a', 'l', 'n', 'u', 'm',
'a', 'l', 'p', 'h', 'a',
'b', 'l', 'a', 'n', 'k',
@ -172,11 +172,12 @@ int get_default_class_id(const charT* p1, const charT* p2)
's', 'p', 'a', 'c', 'e',
'u', 'n', 'i', 'c', 'o', 'd', 'e',
'u', 'p', 'p', 'e', 'r',
'v',
'w', 'o', 'r', 'd',
'x', 'd', 'i', 'g', 'i', 't',
};
static const character_pointer_range<charT> ranges[19] =
static const character_pointer_range<charT> ranges[21] =
{
{data+0, data+5,}, // alnum
{data+5, data+10,}, // alpha
@ -185,6 +186,7 @@ int get_default_class_id(const charT* p1, const charT* p2)
{data+20, data+21,}, // d
{data+20, data+25,}, // digit
{data+25, data+30,}, // graph
{data+29, data+30,}, // h
{data+30, data+31,}, // l
{data+30, data+35,}, // lower
{data+35, data+40,}, // print
@ -194,9 +196,10 @@ int get_default_class_id(const charT* p1, const charT* p2)
{data+57, data+58,}, // u
{data+50, data+57,}, // unicode
{data+57, data+62,}, // upper
{data+62, data+63,}, // w
{data+62, data+66,}, // word
{data+66, data+72,}, // xdigit
{data+62, data+63,}, // v
{data+63, data+64,}, // w
{data+63, data+67,}, // word
{data+67, data+73,}, // xdigit
};
static const character_pointer_range<charT>* ranges_begin = ranges;
static const character_pointer_range<charT>* ranges_end = ranges + (sizeof(ranges)/sizeof(ranges[0]));

View File

@ -294,6 +294,8 @@ public:
typedef typename w32_regex_traits<charT>::char_class_type char_class_type;
BOOST_STATIC_CONSTANT(char_class_type, mask_word = 0x0400); // must be C1_DEFINED << 1
BOOST_STATIC_CONSTANT(char_class_type, mask_unicode = 0x0800); // must be C1_DEFINED << 2
BOOST_STATIC_CONSTANT(char_class_type, mask_horizontal = 0x1000); // must be C1_DEFINED << 3
BOOST_STATIC_CONSTANT(char_class_type, mask_vertical = 0x2000); // must be C1_DEFINED << 4
BOOST_STATIC_CONSTANT(char_class_type, mask_base = 0x3ff); // all the masks used by the CT_CTYPE1 group
typedef std::basic_string<charT> string_type;
@ -510,7 +512,7 @@ template <class charT>
typename w32_regex_traits_implementation<charT>::char_class_type
w32_regex_traits_implementation<charT>::lookup_classname_imp(const charT* p1, const charT* p2) const
{
static const char_class_type masks[20] =
static const char_class_type masks[22] =
{
0,
0x0104u, // C1_ALPHA | C1_DIGIT
@ -520,6 +522,7 @@ typename w32_regex_traits_implementation<charT>::char_class_type
0x0004u, // C1_DIGIT
0x0004u, // C1_DIGIT
(~(0x0020u|0x0008u|0x0040) & 0x01ffu) | 0x0400u, // not C1_CNTRL or C1_SPACE or C1_BLANK
w32_regex_traits_implementation<charT>::mask_horizontal,
0x0002u, // C1_LOWER
0x0002u, // C1_LOWER
(~0x0020u & 0x01ffu) | 0x0400, // not C1_CNTRL
@ -529,6 +532,7 @@ typename w32_regex_traits_implementation<charT>::char_class_type
0x0001u, // C1_UPPER
w32_regex_traits_implementation<charT>::mask_unicode,
0x0001u, // C1_UPPER
w32_regex_traits_implementation<charT>::mask_vertical,
0x0104u | w32_regex_traits_implementation<charT>::mask_word,
0x0104u | w32_regex_traits_implementation<charT>::mask_word,
0x0080u, // C1_XDIGIT
@ -628,6 +632,12 @@ public:
return true;
else if((f & re_detail::w32_regex_traits_implementation<charT>::mask_word) && (c == '_'))
return true;
else if((f & re_detail::w32_regex_traits_implementation<charT>::mask_vertical)
&& (::boost::re_detail::is_separator(c) || (c == '\v')))
return true;
else if((f & re_detail::w32_regex_traits_implementation<charT>::mask_horizontal)
&& this->isctype(c, 0x0008u) && !this->isctype(c, re_detail::w32_regex_traits_implementation<charT>::mask_vertical))
return true;
return false;
}
int toi(const charT*& p1, const charT* p2, int radix)const