Add support for named sub-expressions.

[SVN r52823]
This commit is contained in:
John Maddock
2009-05-07 09:46:51 +00:00
parent 30941e330d
commit 55d979060c
49 changed files with 1287 additions and 206 deletions

View File

@ -844,6 +844,42 @@ struct BoostRegexConcept
m_string = m_char + m_sub;
ignore_unused_variable_warning(m_string);
// Named sub-expressions:
m_sub = m_cresults[&m_char];
ignore_unused_variable_warning(m_sub);
m_sub = m_cresults[m_string];
ignore_unused_variable_warning(m_sub);
m_sub = m_cresults[""];
ignore_unused_variable_warning(m_sub);
m_sub = m_cresults[std::string("")];
ignore_unused_variable_warning(m_sub);
m_string = m_cresults.str(&m_char);
ignore_unused_variable_warning(m_string);
m_string = m_cresults.str(m_string);
ignore_unused_variable_warning(m_string);
m_string = m_cresults.str("");
ignore_unused_variable_warning(m_string);
m_string = m_cresults.str(std::string(""));
ignore_unused_variable_warning(m_string);
typename match_results_type::difference_type diff;
diff = m_cresults.length(&m_char);
ignore_unused_variable_warning(diff);
diff = m_cresults.length(m_string);
ignore_unused_variable_warning(diff);
diff = m_cresults.length("");
ignore_unused_variable_warning(diff);
diff = m_cresults.length(std::string(""));
ignore_unused_variable_warning(diff);
diff = m_cresults.position(&m_char);
ignore_unused_variable_warning(diff);
diff = m_cresults.position(m_string);
ignore_unused_variable_warning(diff);
diff = m_cresults.position("");
ignore_unused_variable_warning(diff);
diff = m_cresults.position(std::string(""));
ignore_unused_variable_warning(diff);
#ifndef BOOST_NO_STD_LOCALE
m_stream << m_sub;
m_stream << m_cresults;

View File

@ -19,6 +19,8 @@
#ifndef BOOST_REGEX_V4_BASIC_REGEX_HPP
#define BOOST_REGEX_V4_BASIC_REGEX_HPP
#include <boost/type_traits/is_same.hpp>
#ifdef BOOST_MSVC
#pragma warning(push)
#pragma warning(disable: 4103)
@ -44,12 +46,123 @@ namespace re_detail{
template <class charT, class traits>
class basic_regex_parser;
//
// Insertion step for a sorted sequence: the final element of [first, last)
// is swapped towards the front until the element before it compares
// strictly less than it (or it reaches the front).
// Requires random access iterators whose value type has operator< and a
// member swap().  An empty range is left untouched.
//
template <class I>
void bubble_down_one(I first, I last)
{
   if(first == last)
      return;
   for(I cur = last - 1; cur != first; --cur)
   {
      I prev = cur - 1;
      // stop as soon as the predecessor is strictly smaller:
      if(*prev < *cur)
         break;
      prev->swap(*cur);
   }
}
//
// Class named_subexpressions
// Contains information about named subexpressions within the regex.
//
//
// Abstract interface for looking up the index of a named sub-expression.
//
template <class charT>
class named_subexpressions_base
{
public:
   // Objects of derived types are owned through pointers to this base,
   // so destruction through the base must be well defined:
   virtual ~named_subexpressions_base() {}
   //
   // Returns the index associated with the name held in [i, j).
   // The implementations in this file return -1 when the name is unknown.
   //
   virtual int get_id(const charT* i, const charT* j) = 0;
};
//
// Table of (name, index) pairs kept sorted by name so that lookups can
// use a binary search.
//
template <class charT>
class named_subexpressions : public named_subexpressions_base<charT>
{
   // One table entry: the characters of the name plus the associated index.
   struct name
   {
      name(const charT* i, const charT* j, int idx)
         : n(i, j), index(idx) {}
      std::vector<charT> n;
      int index;
      // Ordering and equality consider the name only, never the index:
      bool operator < (const name& other)const
      {
         return std::lexicographical_compare(n.begin(), n.end(), other.n.begin(), other.n.end());
      }
      bool operator == (const name& other)const
      {
         return n == other.n;
      }
      void swap(name& other)
      {
         n.swap(other.n);
         std::swap(index, other.index);
      }
   };
public:
   named_subexpressions(){}
   //
   // Records that the name in [i, j) refers to "index".
   // The new entry is appended and then moved into its sorted position.
   //
   void set_name(const charT* i, const charT* j, int index)
   {
      m_sub_names.push_back(name(i, j, index));
      bubble_down_one(m_sub_names.begin(), m_sub_names.end());
   }
   //
   // Returns the index recorded for the name in [i, j), or -1 if the
   // name has not been recorded.
   //
   int get_id(const charT* i, const charT* j)
   {
      name t(i, j, 0);
      // Qualified call: an unqualified lower_bound is only found through
      // argument dependent lookup, which fails when the vector's iterators
      // are plain pointers.
      typename std::vector<name>::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t);
      if((pos != m_sub_names.end()) && (*pos == t))
      {
         return pos->index;
      }
      return -1;
   }
private:
   std::vector<name> m_sub_names;   // entries, sorted by name
};
//
// Adaptor that answers lookups expressed in charT by forwarding them to a
// name table whose names are stored as Other.
//
template <class charT, class Other>
class named_subexpressions_converter : public named_subexpressions_base<charT>
{
   boost::shared_ptr<named_subexpressions<Other> > m_converter;
public:
   named_subexpressions_converter(boost::shared_ptr<named_subexpressions<Other> > s)
      : m_converter(s) {}
   //
   // Converts each character of [i, j) to Other and looks the result up
   // in the wrapped table.  An empty name is reported as not found (-1)
   // without consulting the table.
   //
   virtual int get_id(const charT* i, const charT* j)
   {
      if(i == j)
         return -1;
      std::vector<Other> converted;
      for(; i != j; ++i)
         converted.push_back(*i);
      // converted is non-empty here, so taking the address of element 0 is valid:
      const Other* first = &converted[0];
      return m_converter->get_id(first, first + converted.size());
   }
};
//
// Overload selected when the requested character type is the one the
// table already uses: the table is handed back through its base interface.
//
template <class To>
inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs_imp(
   boost::shared_ptr<named_subexpressions<To> > s,
   boost::integral_constant<bool,true> const&)
{
   boost::shared_ptr<named_subexpressions_base<To> > as_base(s);
   return as_base;
}
//
// Overload selected when the character types differ: the table is wrapped
// in a named_subexpressions_converter that accepts names expressed in To.
//
template <class To, class From>
inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs_imp(
   boost::shared_ptr<named_subexpressions<From> > s,
   boost::integral_constant<bool,false> const&)
{
   typedef named_subexpressions_converter<To, From> converter_type;
   boost::shared_ptr<converter_type> adaptor(new converter_type(s));
   return adaptor;
}
//
// Returns an object able to look up names expressed in To from a table
// whose names are stored as From.  Dispatches on whether the two
// character types are the same.
//
template <class To, class From>
inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs(
   boost::shared_ptr<named_subexpressions<From> > s)
{
   typedef typename boost::is_same<To, From>::type same_char_type;
   return convert_to_named_subs_imp<To>(s, same_char_type());
}
//
// class regex_data:
// represents the data we wish to expose to the matching algorithms.
//
template <class charT, class traits>
struct regex_data
struct regex_data : public named_subexpressions<charT>
{
typedef regex_constants::syntax_option_type flag_type;
typedef std::size_t size_type;
@ -520,6 +633,10 @@ public:
BOOST_ASSERT(0 != m_pimpl.get());
return m_pimpl->get_data();
}
//
// Returns the implementation object viewed as a table of named
// sub-expressions (relies on the implicit shared_ptr conversion from the
// implementation type).
//
boost::shared_ptr<re_detail::named_subexpressions<charT> > get_named_subs()const
{
   boost::shared_ptr<re_detail::named_subexpressions<charT> > subs(m_pimpl);
   return subs;
}
private:
shared_ptr<re_detail::basic_regex_implementation<charT, traits> > m_pimpl;

View File

@ -777,6 +777,15 @@ escape_type_class_jump:
}
const charT* pc = m_position;
int i = this->m_traits.toi(pc, m_end, 10);
if(i < 0)
{
// Check for a named capture:
const charT* base = m_position;
while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
++m_position;
i = this->m_pdata->get_id(base, m_position);
pc = m_position;
}
if(negative)
i = 1 + m_mark_count - i;
if((i > 0) && (this->m_backrefs & (1u << (i-1))))
@ -1784,6 +1793,7 @@ bool basic_regex_parser<charT, traits>::parse_perl_extension()
regex_constants::syntax_option_type old_flags = this->flags();
bool old_case_change = m_has_case_change;
m_has_case_change = false;
charT name_delim;
//
// select the actual extension used:
//
@ -1825,8 +1835,10 @@ bool basic_regex_parser<charT, traits>::parse_perl_extension()
pb->index = markid = -1;
else
{
fail(regex_constants::error_badrepeat, m_position - m_base);
return false;
// Probably a named capture which also starts (?< :
name_delim = '>';
--m_position;
goto named_capture_jump;
}
++m_position;
jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
@ -1903,7 +1915,7 @@ bool basic_regex_parser<charT, traits>::parse_perl_extension()
if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
{
fail(regex_constants::error_badrepeat, m_position - m_base);
fail(regex_constants::error_paren, m_position - m_base);
return false;
}
m_position -= 2;
@ -1914,6 +1926,40 @@ bool basic_regex_parser<charT, traits>::parse_perl_extension()
case regex_constants::syntax_close_mark:
fail(regex_constants::error_badrepeat, m_position - m_base);
return false;
case regex_constants::escape_type_end_buffer:
{
name_delim = *m_position;
named_capture_jump:
markid = 0;
if(0 == (this->flags() & regbase::nosubs))
{
markid = ++m_mark_count;
#ifndef BOOST_NO_STD_DISTANCE
if(this->flags() & regbase::save_subexpression_location)
this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 2, 0));
#else
if(this->flags() & regbase::save_subexpression_location)
this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 2, 0));
#endif
}
pb->index = markid;
const charT* base = ++m_position;
if(m_position == m_end)
{
fail(regex_constants::error_paren, m_position - m_base);
return false;
}
while((m_position != m_end) && (*m_position != name_delim))
++m_position;
if(m_position == m_end)
{
fail(regex_constants::error_paren, m_position - m_base);
return false;
}
this->m_pdata->set_name(base, m_position, markid);
++m_position;
break;
}
default:
//
// let's assume that we have a (?imsx) group and try to parse it:
@ -2043,6 +2089,22 @@ bool basic_regex_parser<charT, traits>::parse_perl_extension()
// and the case change data:
//
m_has_case_change = old_case_change;
if(markid > 0)
{
#ifndef BOOST_NO_STD_DISTANCE
if(this->flags() & regbase::save_subexpression_location)
this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position) - 1;
#else
if(this->flags() & regbase::save_subexpression_location)
this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base) - 1;
#endif
//
// allow backrefs to this mark:
//
if((markid > 0) && (markid < (int)(sizeof(unsigned) * CHAR_BIT)))
this->m_backrefs |= 1u << (markid - 1);
}
return true;
}

View File

@ -36,6 +36,13 @@ namespace boost{
#pragma warning(disable : 4251 4231 4660)
#endif
namespace re_detail{
template <class charT>
class named_subexpressions;
}
template <class BidiIterator, class Allocator>
class match_results
{
@ -62,13 +69,14 @@ public:
typedef typename re_detail::regex_iterator_traits<
BidiIterator>::value_type char_type;
typedef std::basic_string<char_type> string_type;
typedef re_detail::named_subexpressions_base<char_type> named_sub_type;
// construct/copy/destroy:
explicit match_results(const Allocator& a = Allocator())
#ifndef BOOST_NO_STD_ALLOCATOR
: m_subs(a), m_base() {}
: m_subs(a), m_base(), m_last_closed_paren(0) {}
#else
: m_subs(), m_base() { (void)a; }
: m_subs(), m_base(), m_last_closed_paren(0) { (void)a; }
#endif
match_results(const match_results& m)
: m_subs(m.m_subs), m_base(m.m_base) {}
@ -95,6 +103,24 @@ public:
return m_subs[sub].length();
return 0;
}
//
// Length of the sub-expression named by the NUL-terminated string "sub".
//
difference_type length(const char_type* sub) const
{
   // locate the terminating NUL:
   const char_type* last = sub;
   for(; *last; ++last) {}
   return length(named_subexpression_index(sub, last));
}
//
// As above, for a NUL-terminated name whose character type differs from
// char_type.
//
template <class charT>
difference_type length(const charT* sub) const
{
   // locate the terminating NUL:
   const charT* last = sub;
   for(; *last; ++last) {}
   return length(named_subexpression_index(sub, last));
}
//
// Length of the sub-expression named by "sub"; the name is taken from
// the string's c_str(), i.e. up to its first NUL character.
//
template <class charT, class Traits, class A>
difference_type length(const std::basic_string<charT, Traits, A>& sub) const
{
   const charT* name = sub.c_str();
   return length(name);
}
difference_type position(size_type sub = 0) const
{
sub += 2;
@ -108,6 +134,24 @@ public:
}
return ~static_cast<difference_type>(0);
}
//
// Position of the sub-expression named by the NUL-terminated string "sub".
//
difference_type position(const char_type* sub) const
{
   // locate the terminating NUL:
   const char_type* last = sub;
   for(; *last; ++last) {}
   return position(named_subexpression_index(sub, last));
}
//
// As above, for a NUL-terminated name whose character type differs from
// char_type.
//
template <class charT>
difference_type position(const charT* sub) const
{
   // locate the terminating NUL:
   const charT* last = sub;
   for(; *last; ++last) {}
   return position(named_subexpression_index(sub, last));
}
//
// Position of the sub-expression named by "sub"; the name is taken from
// the string's c_str(), i.e. up to its first NUL character.
//
template <class charT, class Traits, class A>
difference_type position(const std::basic_string<charT, Traits, A>& sub) const
{
   const charT* name = sub.c_str();
   return position(name);
}
string_type str(int sub = 0) const
{
sub += 2;
@ -122,6 +166,25 @@ public:
}
return result;
}
//
// Text matched by the sub-expression named by the NUL-terminated
// string "sub".
//
string_type str(const char_type* sub) const
{
   const_reference named = (*this)[sub];
   return named.str();
}
//
// Text matched by the sub-expression named by "sub" (a string of
// char_type).
//
template <class Traits, class A>
string_type str(const std::basic_string<char_type, Traits, A>& sub) const
{
   const_reference named = (*this)[sub];
   return named.str();
}
//
// Text matched by the sub-expression named by the NUL-terminated string
// "sub", whose character type differs from char_type.
//
template <class charT>
string_type str(const charT* sub) const
{
   const_reference named = (*this)[sub];
   return named.str();
}
//
// Text matched by the sub-expression named by "sub", a string whose
// character type differs from char_type.
//
template <class charT, class Traits, class A>
string_type str(const std::basic_string<charT, Traits, A>& sub) const
{
   const_reference named = (*this)[sub];
   return named.str();
}
const_reference operator[](int sub) const
{
sub += 2;
@ -131,6 +194,75 @@ public:
}
return m_null;
}
//
// Named sub-expressions:
//
//
// Returns the sub-match whose name is held in [i, j), or the null
// sub-match when there is no such name.
//
const_reference named_subexpression(const char_type* i, const char_type* j) const
{
   // The name table is only attached by set_named_subs(); until then
   // there is nothing to search and the pointer must not be dereferenced.
   if(0 == m_named_subs.get())
      return m_null;
   int index = m_named_subs->get_id(i, j);
   return index > 0 ? (*this)[index] : m_null;
}
//
// As above, for a name whose character type differs from char_type (and
// is no wider than it): the name is copied into a char_type buffer and
// looked up from there.  An empty name yields the null sub-match.
//
template <class charT>
const_reference named_subexpression(const charT* i, const charT* j) const
{
   BOOST_STATIC_ASSERT(sizeof(charT) <= sizeof(char_type));
   if(i == j)
      return m_null;
   std::vector<char_type> buffer;
   for(; i != j; ++i)
      buffer.push_back(*i);
   // buffer is non-empty here, so element 0 exists:
   const char_type* first = &buffer[0];
   return named_subexpression(first, first + buffer.size());
}
//
// Returns the index of the sub-expression whose name is held in [i, j),
// or -20 when there is no such name.
// NOTE(review): -20 is presumably chosen so that the index-based
// accessors, which add 2 to their argument, still see an out of range
// value - confirm against those accessors.
//
int named_subexpression_index(const char_type* i, const char_type* j) const
{
   // The name table is only attached by set_named_subs(); until then
   // there is nothing to search and the pointer must not be dereferenced.
   if(0 == m_named_subs.get())
      return -20;
   int index = m_named_subs->get_id(i, j);
   return index > 0 ? index : -20;
}
template <class charT>
int named_subexpression_index(const charT* i, const charT* j) const
{
BOOST_STATIC_ASSERT(sizeof(charT) <= sizeof(char_type));
if(i == j)
return -20;
std::vector<char_type> s;
while(i != j)
s.insert(s.end(), *i++);
return named_subexpression_index(&*s.begin(), &*s.begin() + s.size());
}
//
// Access a sub-match by name, the name being all size() characters of "s".
//
template <class Traits, class A>
const_reference operator[](const std::basic_string<char_type, Traits, A>& s) const
{
   const char_type* first = s.c_str();
   return named_subexpression(first, first + s.size());
}
//
// Access a sub-match by name, the name being the NUL-terminated string "p".
//
const_reference operator[](const char_type* p) const
{
   // locate the terminating NUL:
   const char_type* last = p;
   for(; *last; ++last) {}
   return named_subexpression(p, last);
}
//
// Access a sub-match by name, the name being a NUL-terminated string
// whose character type differs from char_type (and is no wider than it).
// An empty name yields the null sub-match.
//
template <class charT>
const_reference operator[](const charT* p) const
{
   BOOST_STATIC_ASSERT(sizeof(charT) <= sizeof(char_type));
   if(*p == 0)
      return m_null;
   // copy the name, up to but excluding the NUL, into a char_type buffer:
   std::vector<char_type> buffer;
   for(; *p; ++p)
      buffer.push_back(*p);
   const char_type* first = &buffer[0];
   return named_subexpression(first, first + buffer.size());
}
//
// Access a sub-match by name, the name being all the characters of "ns",
// a string whose character type differs from char_type (and is no wider
// than it).  An empty name yields the null sub-match.
//
template <class charT, class Traits, class A>
const_reference operator[](const std::basic_string<charT, Traits, A>& ns) const
{
   BOOST_STATIC_ASSERT(sizeof(charT) <= sizeof(char_type));
   if(ns.empty())
      return m_null;
   // Walk the string with its own iterators rather than an "unsigned"
   // counter, which may be narrower than the string's size_type:
   typedef typename std::basic_string<charT, Traits, A>::const_iterator name_iterator;
   std::vector<char_type> s;
   for(name_iterator it = ns.begin(); it != ns.end(); ++it)
      s.push_back(*it);
   // s is non-empty here, so element 0 exists:
   const char_type* first = &s[0];
   return named_subexpression(first, first + s.size());
}
const_reference prefix() const
{
@ -186,6 +318,10 @@ public:
::boost::re_detail::regex_format_imp(i, *this, fmt.data(), fmt.data() + fmt.size(), flags, re.get_traits());
return result;
}
//
// Returns the sub-match whose index was most recently recorded in
// m_last_closed_paren, or the null sub-match if that index is zero.
//
const_reference get_last_closed_paren()const
{
   if(m_last_closed_paren == 0)
      return m_null;
   return (*this)[m_last_closed_paren];
}
allocator_type get_allocator() const
{
@ -232,6 +368,8 @@ public:
void BOOST_REGEX_CALL set_second(BidiIterator i, size_type pos, bool m = true, bool escape_k = false)
{
if(pos)
m_last_closed_paren = pos;
pos += 2;
BOOST_ASSERT(m_subs.size() > pos);
m_subs[pos].second = i;
@ -261,6 +399,7 @@ public:
m_subs.insert(m_subs.end(), n+2-len, v);
}
m_subs[1].first = i;
m_last_closed_paren = 0;
}
void BOOST_REGEX_CALL set_base(BidiIterator pos)
{
@ -301,11 +440,17 @@ public:
}
void BOOST_REGEX_CALL maybe_assign(const match_results<BidiIterator, Allocator>& m);
//
// Attaches the table used to translate sub-expression names into
// indexes; it replaces any table attached earlier.
//
void BOOST_REGEX_CALL set_named_subs(boost::shared_ptr<named_sub_type> subs)
{
   this->m_named_subs = subs;
}
private:
vector_type m_subs; // subexpressions
BidiIterator m_base; // where the search started from
sub_match<BidiIterator> m_null; // a null match
boost::shared_ptr<named_sub_type> m_named_subs;
int m_last_closed_paren;
};
template <class BidiIterator, class Allocator>

View File

@ -200,6 +200,7 @@ bool perl_matcher<BidiIterator, Allocator, traits>::match_imp()
m_match_flags |= regex_constants::match_all;
m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), search_base, last);
m_presult->set_base(base);
m_presult->set_named_subs(re_detail::convert_to_named_subs<typename match_results<BidiIterator>::char_type>(this->re.get_named_subs()));
if(m_match_flags & match_posix)
m_result = *m_presult;
verify_options(re.flags(), m_match_flags);
@ -261,6 +262,7 @@ bool perl_matcher<BidiIterator, Allocator, traits>::find_imp()
pstate = re.get_first_state();
m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), base, last);
m_presult->set_base(base);
m_presult->set_named_subs(re_detail::convert_to_named_subs<typename match_results<BidiIterator>::char_type>(this->re.get_named_subs()));
m_match_flags |= regex_constants::match_init;
}
else

View File

@ -107,6 +107,7 @@ private:
void format_escape();
void format_conditional();
void format_until_scope_end();
bool handle_perl_verb(bool have_brace);
const traits& m_traits; // the traits class for localised formatting operations
const Results& m_results; // the match_results being used.
@ -250,6 +251,25 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_perl()
case '$':
put(*m_position++);
break;
case '+':
if((++m_position != m_end) && (*m_position == '{'))
{
const char_type* base = ++m_position;
while((m_position != m_end) && (*m_position != '}')) ++m_position;
if(m_position != m_end)
{
// Named sub-expression:
put(this->m_results.named_subexpression(base, m_position));
++m_position;
break;
}
else
{
m_position = --base;
}
}
put((this->m_results)[this->m_results.size() > 1 ? this->m_results.size() - 1 : 1]);
break;
case '{':
have_brace = true;
++m_position;
@ -258,14 +278,18 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_perl()
// see if we have a number:
{
std::ptrdiff_t len = ::boost::re_detail::distance(m_position, m_end);
len = (std::min)(static_cast<std::ptrdiff_t>(2), len);
//len = (std::min)(static_cast<std::ptrdiff_t>(2), len);
int v = m_traits.toi(m_position, m_position + len, 10);
if((v < 0) || (have_brace && ((m_position == m_end) || (*m_position != '}'))))
{
// leave the $ as is, and carry on:
m_position = --save_position;
put(*m_position);
++m_position;
// Look for a Perl-5.10 verb:
if(!handle_perl_verb(have_brace))
{
// leave the $ as is, and carry on:
m_position = --save_position;
put(*m_position);
++m_position;
}
break;
}
// otherwise output sub v:
@ -276,6 +300,123 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_perl()
}
}
//
// Helper for handle_perl_verb: tests whether the text in [pos, end) begins
// with the N characters of "verb" and, when have_brace is set, whether
// the verb is immediately followed by '}'.
// On success pos is moved past the verb (and past the brace, if any) and
// true is returned; otherwise pos is left unchanged and false is returned.
// Nothing at or beyond "end" is ever read.
//
template <class Iterator, class charT, std::size_t N>
inline bool consume_perl_verb(Iterator& pos, Iterator end, const charT (&verb)[N], bool have_brace)
{
   const std::ptrdiff_t len = static_cast<std::ptrdiff_t>(N);
   if(((end - pos) < len) || !std::equal(pos, pos + len, verb))
      return false;
   Iterator next = pos + len;
   if(have_brace)
   {
      // the closing brace is mandatory, and must lie inside the input:
      if((next == end) || (*next != '}'))
         return false;
      ++next;
   }
   pos = next;
   return true;
}

//
// Tries to interpret the text at the current format position as one of
// the upper case Perl verbs MATCH, PREMATCH, POSTMATCH, LAST_PAREN_MATCH,
// LAST_SUBMATCH_RESULT or ^N and, if one is recognised, outputs the
// corresponding part of the match.
//
// have_brace: true when the verb was introduced by '{'; a single leading
//             '^' is then skipped and the verb must be followed by '}'.
//
// Returns true if a verb was consumed and its replacement text output;
// returns false, with nothing output, if no verb was recognised.
//
template <class OutputIterator, class Results, class traits>
bool basic_regex_formatter<OutputIterator, Results, traits>::handle_perl_verb(bool have_brace)
{
   //
   // We may have a capitalised string containing a Perl action:
   //
   static const char_type MATCH[] = { 'M', 'A', 'T', 'C', 'H' };
   static const char_type PREMATCH[] = { 'P', 'R', 'E', 'M', 'A', 'T', 'C', 'H' };
   static const char_type POSTMATCH[] = { 'P', 'O', 'S', 'T', 'M', 'A', 'T', 'C', 'H' };
   static const char_type LAST_PAREN_MATCH[] = { 'L', 'A', 'S', 'T', '_', 'P', 'A', 'R', 'E', 'N', '_', 'M', 'A', 'T', 'C', 'H' };
   static const char_type LAST_SUBMATCH_RESULT[] = { 'L', 'A', 'S', 'T', '_', 'S', 'U', 'B', 'M', 'A', 'T', 'C', 'H', '_', 'R', 'E', 'S', 'U', 'L', 'T' };
   static const char_type LAST_SUBMATCH_RESULT_ALT[] = { '^', 'N' };

   // Nothing left to examine (for example the format string ends right
   // after "${"): there is no verb, and m_position must not be dereferenced.
   if(m_position == m_end)
      return false;
   if(have_brace && (*m_position == '^'))
      ++m_position;

   // No verb is a prefix of another, so at most one of the tests below
   // can succeed and their order does not affect the outcome.
   if(consume_perl_verb(m_position, m_end, MATCH, have_brace))
   {
      put(this->m_results[0]);
      return true;
   }
   if(consume_perl_verb(m_position, m_end, PREMATCH, have_brace))
   {
      put(this->m_results.prefix());
      return true;
   }
   if(consume_perl_verb(m_position, m_end, POSTMATCH, have_brace))
   {
      put(this->m_results.suffix());
      return true;
   }
   if(consume_perl_verb(m_position, m_end, LAST_PAREN_MATCH, have_brace))
   {
      put((this->m_results)[this->m_results.size() > 1 ? this->m_results.size() - 1 : 1]);
      return true;
   }
   if(consume_perl_verb(m_position, m_end, LAST_SUBMATCH_RESULT, have_brace))
   {
      put(this->m_results.get_last_closed_paren());
      return true;
   }
   if(consume_perl_verb(m_position, m_end, LAST_SUBMATCH_RESULT_ALT, have_brace))
   {
      put(this->m_results.get_last_closed_paren());
      return true;
   }
   return false;
}
template <class OutputIterator, class Results, class traits>
void basic_regex_formatter<OutputIterator, Results, traits>::format_escape()
{

View File

@ -326,9 +326,9 @@ inline const charT* get_escape_R_string()
#endif
static const charT e1[] = { '(', '?', '>', '\x0D', '\x0A', '?',
'|', '[', '\x0A', '\x0B', '\x0C', '\x85', '\\', 'x', '{', '2', '0', '2', '8', '}',
'\\', 'x', '{', '2', '0', '2', '9', '}', ']', ')' };
'\\', 'x', '{', '2', '0', '2', '9', '}', ']', ')', '\0' };
static const charT e2[] = { '(', '?', '>', '\x0D', '\x0A', '?',
'|', '[', '\x0A', '\x0B', '\x0C', '\x85', ']', ')' };
'|', '[', '\x0A', '\x0B', '\x0C', '\x85', ']', ')', '\0' };
charT c = static_cast<charT>(0x2029u);
bool b = (static_cast<unsigned>(c) == 0x2029u);