| /* |
| * |
| * Copyright (c) 1998-2002 |
| * John Maddock |
| * |
| * Use, modification and distribution are subject to the |
| * Boost Software License, Version 1.0. (See accompanying file |
| * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
| * |
| */ |
| |
| /* |
| * LOCATION: see http://www.boost.org for most recent version. |
| * FILE states.cpp |
| * VERSION see <ndnboost/version.hpp> |
| * DESCRIPTION: Declares internal state machine structures. |
| */ |
| |
| #ifndef NDNBOOST_REGEX_V4_STATES_HPP |
| #define NDNBOOST_REGEX_V4_STATES_HPP |
| |
| #ifdef NDNBOOST_MSVC |
| #pragma warning(push) |
| #pragma warning(disable: 4103) |
| #endif |
| #ifdef NDNBOOST_HAS_ABI_HEADERS |
| # include NDNBOOST_ABI_PREFIX |
| #endif |
| #ifdef NDNBOOST_MSVC |
| #pragma warning(pop) |
| #endif |
| |
| namespace ndnboost{ |
| namespace re_detail{ |
| |
| /*** mask_type ******************************************************* |
| Whenever we have a choice of two alternatives, we use an array of bytes |
| to indicate which of the two alternatives it is possible to take for any |
| given input character. If mask_take is set, then we can take the next |
| state, and if mask_skip is set then we can take the alternative. |
| ***********************************************************************/ |
| enum mask_type |
| { |
| mask_take = 1, |
| mask_skip = 2, |
| mask_init = 4, |
| mask_any = mask_skip | mask_take, |
| mask_all = mask_any |
| }; |
| |
| /*** helpers ********************************************************** |
| These helpers let us use function overload resolution to detect whether |
| we have narrow or wide character strings: |
| ***********************************************************************/ |
| struct _narrow_type{}; |
| struct _wide_type{}; |
| template <class charT> struct is_byte; |
| template<> struct is_byte<char> { typedef _narrow_type width_type; }; |
| template<> struct is_byte<unsigned char>{ typedef _narrow_type width_type; }; |
| template<> struct is_byte<signed char> { typedef _narrow_type width_type; }; |
| template <class charT> struct is_byte { typedef _wide_type width_type; }; |
| |
| /*** enum syntax_element_type ****************************************** |
| Every record in the state machine falls into one of the following types: |
| ***********************************************************************/ |
| enum syntax_element_type |
| { |
| // start of a marked sub-expression, or perl-style (?...) extension |
| syntax_element_startmark = 0, |
| // end of a marked sub-expression, or perl-style (?...) extension |
| syntax_element_endmark = syntax_element_startmark + 1, |
| // any sequence of literal characters |
| syntax_element_literal = syntax_element_endmark + 1, |
| // start of line assertion: ^ |
| syntax_element_start_line = syntax_element_literal + 1, |
| // end of line assertion $ |
| syntax_element_end_line = syntax_element_start_line + 1, |
| // match any character: . |
| syntax_element_wild = syntax_element_end_line + 1, |
| // end of expression: we have a match when we get here |
| syntax_element_match = syntax_element_wild + 1, |
| // perl style word boundary: \b |
| syntax_element_word_boundary = syntax_element_match + 1, |
| // perl style within word boundary: \B |
| syntax_element_within_word = syntax_element_word_boundary + 1, |
| // start of word assertion: \< |
| syntax_element_word_start = syntax_element_within_word + 1, |
| // end of word assertion: \> |
| syntax_element_word_end = syntax_element_word_start + 1, |
| // start of buffer assertion: \` |
| syntax_element_buffer_start = syntax_element_word_end + 1, |
| // end of buffer assertion: \' |
| syntax_element_buffer_end = syntax_element_buffer_start + 1, |
| // backreference to previously matched sub-expression |
| syntax_element_backref = syntax_element_buffer_end + 1, |
| // either a wide character set [..] or one with multicharacter collating elements: |
| syntax_element_long_set = syntax_element_backref + 1, |
| // narrow character set: [...] |
| syntax_element_set = syntax_element_long_set + 1, |
| // jump to a new state in the machine: |
| syntax_element_jump = syntax_element_set + 1, |
| // choose between two production states: |
| syntax_element_alt = syntax_element_jump + 1, |
| // a repeat |
| syntax_element_rep = syntax_element_alt + 1, |
| // match a combining character sequence |
| syntax_element_combining = syntax_element_rep + 1, |
| // perl style soft buffer end: \z |
| syntax_element_soft_buffer_end = syntax_element_combining + 1, |
| // perl style continuation: \G |
| syntax_element_restart_continue = syntax_element_soft_buffer_end + 1, |
| // single character repeats: |
| syntax_element_dot_rep = syntax_element_restart_continue + 1, |
| syntax_element_char_rep = syntax_element_dot_rep + 1, |
| syntax_element_short_set_rep = syntax_element_char_rep + 1, |
| syntax_element_long_set_rep = syntax_element_short_set_rep + 1, |
| // a backstep for lookbehind repeats: |
| syntax_element_backstep = syntax_element_long_set_rep + 1, |
| // an assertion that a mark was matched: |
| syntax_element_assert_backref = syntax_element_backstep + 1, |
| syntax_element_toggle_case = syntax_element_assert_backref + 1, |
| // a recursive expression: |
| syntax_element_recurse = syntax_element_toggle_case + 1 |
| }; |
| |
| #ifdef NDNBOOST_REGEX_DEBUG |
| // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion |
| std::ostream& operator<<(std::ostream&, syntax_element_type); |
| #endif |
| |
| struct re_syntax_base; |
| |
| /*** union offset_type ************************************************ |
| Points to another state in the machine. During machine construction |
| we use integral offsets, but these are converted to pointers before |
| execution of the machine. |
| ***********************************************************************/ |
| union offset_type |
| { |
| re_syntax_base* p; |
| std::ptrdiff_t i; |
| }; |
| |
| /*** struct re_syntax_base ******************************************** |
| Base class for all states in the machine. |
| ***********************************************************************/ |
| struct re_syntax_base |
| { |
| syntax_element_type type; // what kind of state this is |
| offset_type next; // next state in the machine |
| }; |
| |
| /*** struct re_brace ************************************************** |
| A marked parenthesis. |
| ***********************************************************************/ |
| struct re_brace : public re_syntax_base |
| { |
| // The index to match, can be zero (don't mark the sub-expression) |
| // or negative (for perl style (?...) extentions): |
| int index; |
| bool icase; |
| }; |
| |
| /*** struct re_dot ************************************************** |
| Match anything. |
| ***********************************************************************/ |
| enum |
| { |
| dont_care = 1, |
| force_not_newline = 0, |
| force_newline = 2, |
| |
| test_not_newline = 2, |
| test_newline = 3 |
| }; |
| struct re_dot : public re_syntax_base |
| { |
| unsigned char mask; |
| }; |
| |
| /*** struct re_literal ************************************************ |
| A string of literals, following this structure will be an |
| array of characters: charT[length] |
| ***********************************************************************/ |
| struct re_literal : public re_syntax_base |
| { |
| unsigned int length; |
| }; |
| |
| /*** struct re_case ************************************************ |
| Indicates whether we are moving to a case insensive block or not |
| ***********************************************************************/ |
| struct re_case : public re_syntax_base |
| { |
| bool icase; |
| }; |
| |
| /*** struct re_set_long *********************************************** |
| A wide character set of characters, following this structure will be |
| an array of type charT: |
| First csingles null-terminated strings |
| Then 2 * cranges NULL terminated strings |
| Then cequivalents NULL terminated strings |
| ***********************************************************************/ |
| template <class mask_type> |
| struct re_set_long : public re_syntax_base |
| { |
| unsigned int csingles, cranges, cequivalents; |
| mask_type cclasses; |
| mask_type cnclasses; |
| bool isnot; |
| bool singleton; |
| }; |
| |
| /*** struct re_set **************************************************** |
| A set of narrow-characters, matches any of _map which is none-zero |
| ***********************************************************************/ |
| struct re_set : public re_syntax_base |
| { |
| unsigned char _map[1 << CHAR_BIT]; |
| }; |
| |
| /*** struct re_jump *************************************************** |
| Jump to a new location in the machine (not next). |
| ***********************************************************************/ |
| struct re_jump : public re_syntax_base |
| { |
| offset_type alt; // location to jump to |
| }; |
| |
| /*** struct re_alt *************************************************** |
| Jump to a new location in the machine (possibly next). |
| ***********************************************************************/ |
| struct re_alt : public re_jump |
| { |
| unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump |
| unsigned int can_be_null; // true if we match a NULL string |
| }; |
| |
| /*** struct re_repeat ************************************************* |
| Repeat a section of the machine |
| ***********************************************************************/ |
| struct re_repeat : public re_alt |
| { |
| std::size_t min, max; // min and max allowable repeats |
| int state_id; // Unique identifier for this repeat |
| bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches) |
| bool greedy; // True if this is a greedy repeat |
| }; |
| |
| /*** struct re_recurse ************************************************ |
| Recurse to a particular subexpression. |
| **********************************************************************/ |
| struct re_recurse : public re_jump |
| { |
| int state_id; // identifier of first nested repeat within the recursion. |
| }; |
| |
| /*** enum re_jump_size_type ******************************************* |
| Provides compiled size of re_jump structure (allowing for trailing alignment). |
| We provide this so we know how manybytes to insert when constructing the machine |
| (The value of padding_mask is defined in regex_raw_buffer.hpp). |
| ***********************************************************************/ |
| enum re_jump_size_type |
| { |
| re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), |
| re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), |
| re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) |
| }; |
| |
| /*** proc re_is_set_member ********************************************* |
| Forward declaration: we'll need this one later... |
| ***********************************************************************/ |
| |
| template<class charT, class traits> |
| struct regex_data; |
| |
| template <class iterator, class charT, class traits_type, class char_classT> |
| iterator NDNBOOST_REGEX_CALL re_is_set_member(iterator next, |
| iterator last, |
| const re_set_long<char_classT>* set_, |
| const regex_data<charT, traits_type>& e, bool icase); |
| |
| } // namespace re_detail |
| |
| } // namespace ndnboost |
| |
| #ifdef NDNBOOST_MSVC |
| #pragma warning(push) |
| #pragma warning(disable: 4103) |
| #endif |
| #ifdef NDNBOOST_HAS_ABI_HEADERS |
| # include NDNBOOST_ABI_SUFFIX |
| #endif |
| #ifdef NDNBOOST_MSVC |
| #pragma warning(pop) |
| #endif |
| |
| #endif |
| |
| |