include/ndnboost/regex/v4/states.hpp - ndn-cxx - Gitiles

 /*
  *
  * Copyright (c) 1998-2002
  * John Maddock
  *
  * Use, modification and distribution are subject to the
  * Boost Software License, Version 1.0. (See accompanying file
  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  *
  */

  /*
   *   LOCATION:    see http://www.boost.org for most recent version.
   *   FILE         states.hpp
   *   VERSION      see <ndnboost/version.hpp>
   *   DESCRIPTION: Declares internal state machine structures.
   */

 #ifndef NDNBOOST_REGEX_V4_STATES_HPP
 #define NDNBOOST_REGEX_V4_STATES_HPP

 #ifdef NDNBOOST_MSVC
 #pragma warning(push)
 #pragma warning(disable: 4103)
 #endif
 #ifdef NDNBOOST_HAS_ABI_HEADERS
 #  include NDNBOOST_ABI_PREFIX
 #endif
 #ifdef NDNBOOST_MSVC
 #pragma warning(pop)
 #endif

 namespace ndnboost{
 namespace re_detail{

 /*** mask_type *******************************************************
 Whenever we have a choice of two alternatives, we use an array of bytes
 to indicate which of the two alternatives it is possible to take for any
 given input character.  If mask_take is set, then we can take the next
 state, and if mask_skip is set then we can take the alternative.
 ***********************************************************************/
 enum mask_type
 {
    // Bit flags kept in the per-character byte arrays used wherever the
    // machine has a choice between two alternatives.
    mask_take = 1 << 0,                 // the next state may be taken
    mask_skip = 1 << 1,                 // the alternative may be taken
    mask_init = 1 << 2,                 // NOTE(review): meaning not stated in this header
    mask_any  = mask_take | mask_skip,  // either of the two alternatives
    mask_all  = mask_take | mask_skip   // same value as mask_any
 };

 /*** helpers **********************************************************
 These helpers let us use function overload resolution to detect whether
 we have narrow or wide character strings:
 ***********************************************************************/
 // Empty tag types naming the two character widths; they exist only so
 // that overload resolution can pick between narrow and wide code paths.
 struct _narrow_type{};
 struct _wide_type{};

 // Primary template: any character type not specialised below is wide.
 template <class charT>
 struct is_byte
 {
    typedef _wide_type width_type;
 };

 // The three single-byte character types are narrow.
 template <>
 struct is_byte<char>
 {
    typedef _narrow_type width_type;
 };

 template <>
 struct is_byte<unsigned char>
 {
    typedef _narrow_type width_type;
 };

 template <>
 struct is_byte<signed char>
 {
    typedef _narrow_type width_type;
 };

 /*** enum syntax_element_type ******************************************
 Every record in the state machine falls into one of the following types:
 ***********************************************************************/
 enum syntax_element_type
 {
    // The enumerators are numbered consecutively from zero: only the first
    // is given a value, every later one is its predecessor plus one.

    // opening of a marked sub-expression, or of a perl-style (?...) extension
    syntax_element_startmark = 0,
    // closing of a marked sub-expression, or of a perl-style (?...) extension
    syntax_element_endmark,
    // a run of literal characters
    syntax_element_literal,
    // assertion, start of line: ^
    syntax_element_start_line,
    // assertion, end of line: $
    syntax_element_end_line,
    // wildcard, matches any character: .
    syntax_element_wild,
    // end of the expression: reaching this state means we have a match
    syntax_element_match,
    // perl-style word boundary: \b
    syntax_element_word_boundary,
    // perl-style "within word" (not a boundary): \B
    syntax_element_within_word,
    // assertion, start of word: \<
    syntax_element_word_start,
    // assertion, end of word: \>
    syntax_element_word_end,
    // assertion, start of buffer: \`
    syntax_element_buffer_start,
    // assertion, end of buffer: \'
    syntax_element_buffer_end,
    // back-reference to a previously matched sub-expression
    syntax_element_backref,
    // a wide character set [..], or a set with multicharacter collating elements
    syntax_element_long_set,
    // a narrow character set: [...]
    syntax_element_set,
    // unconditional jump to another state of the machine
    syntax_element_jump,
    // a choice between two production states
    syntax_element_alt,
    // a repeat
    syntax_element_rep,
    // matches a combining character sequence
    syntax_element_combining,
    // perl-style soft buffer end: \z
    syntax_element_soft_buffer_end,
    // perl-style continuation: \G
    syntax_element_restart_continue,
    // repeats of a single character:
    syntax_element_dot_rep,
    syntax_element_char_rep,
    syntax_element_short_set_rep,
    syntax_element_long_set_rep,
    // a backstep, used for lookbehind repeats
    syntax_element_backstep,
    // assertion that a given mark was matched
    syntax_element_assert_backref,
    // case toggle (undocumented in the original list)
    syntax_element_toggle_case,
    // a recursive expression
    syntax_element_recurse
 };

 #ifdef NDNBOOST_REGEX_DEBUG
 // Declared only when NDNBOOST_REGEX_DEBUG is defined: stream insertion
 // for syntax_element_type.
 // dwa 09/26/00 - needed to suppress warnings about an ambiguous conversion.
 std::ostream& operator<<(std::ostream& os, syntax_element_type element);
 #endif

 // Forward declaration: offset_type below needs to name a pointer to
 // re_syntax_base before that struct is defined.
 struct re_syntax_base;

 /*** union offset_type ************************************************
 Points to another state in the machine.  During machine construction
 we use integral offsets, but these are converted to pointers before
 execution of the machine.  The two members share storage, so only the
 form that was last written may be read.
 ***********************************************************************/
 union offset_type
 {
    re_syntax_base*   p;   // pointer form: used once offsets have been converted
    std::ptrdiff_t    i;   // integral offset form: used during machine construction
 };

 /*** struct re_syntax_base ********************************************
 Base class for all states in the machine.
 ***********************************************************************/
 // Common header of every state record; all other re_* state structs in
 // this file derive from it (directly or indirectly).
 struct re_syntax_base
 {
    // Discriminator: one of the syntax_element_type values.
    syntax_element_type   type;         // what kind of state this is
    // An offset_type, so it holds an integral offset while the machine is
    // being constructed and a pointer before the machine is executed.
    offset_type           next;         // next state in the machine
 };

 /*** struct re_brace **************************************************
 A marked parenthesis.
 ***********************************************************************/
 // State record for a marked parenthesis.
 struct re_brace : public re_syntax_base
 {
    // The index to match, can be zero (don't mark the sub-expression)
    // or negative (for perl style (?...) extensions):
    int index;
    // NOTE(review): presumably the case-insensitivity setting that applies
    // to this group - not documented in this header, confirm against the
    // code that reads it.
    bool icase;
 };

 /*** struct re_dot **************************************************
 Match anything.
 ***********************************************************************/
 enum
 {
    // NOTE(review): the meaning of these constants is not documented in
    // this header; the names suggest they relate to how re_dot treats
    // newlines - confirm against the code that builds and matches re_dot.
    // First group, listed in increasing value:
    force_not_newline = 0,
    dont_care = 1,
    force_newline = 2,

    // Second group.  Note that test_not_newline shares the value 2 with
    // force_newline.
    test_not_newline = 2,
    test_newline = 3
 };
 // State record for "match anything".
 struct re_dot : public re_syntax_base
 {
    // NOTE(review): presumably holds one of the dont_care / force_* values
    // from the unnamed enum declared just before this struct - confirm
    // against the code that fills it in.
    unsigned char mask;
 };

 /*** struct re_literal ************************************************
 A string of literals, following this structure will be an
 array of characters: charT[length]
 ***********************************************************************/
 // Header of a literal-string state.  The characters themselves are not a
 // member: they are stored immediately after this structure as
 // charT[length].
 struct re_literal : public re_syntax_base
 {
    // Number of charT elements that follow this structure.
    unsigned int length;
 };

 /*** struct re_case ************************************************
 Indicates whether we are moving to a case insensitive block or not
 ***********************************************************************/
 // State record marking a change of case sensitivity.
 struct re_case : public re_syntax_base
 {
    // Whether the block being entered is case insensitive.
    bool icase;
 };

 /*** struct re_set_long ***********************************************
 A wide character set of characters, following this structure will be
 an array of type charT:
 First csingles null-terminated strings
 Then 2 * cranges NULL terminated strings
 Then cequivalents NULL terminated strings
 ***********************************************************************/
 // Header of a wide character set.  The variable-length string data
 // (csingles strings, then 2 * cranges strings, then cequivalents strings,
 // all null terminated) is stored after this structure, not inside it.
 // char_classT is the type used for the two character-class masks.
 template <class char_classT>
 struct re_set_long : public re_syntax_base
 {
    unsigned int            csingles;       // count of single-element strings that follow
    unsigned int            cranges;        // count of ranges; 2 * cranges strings follow
    unsigned int            cequivalents;   // count of equivalent strings that follow
    // NOTE(review): presumably the character classes that match / must not
    // match - not documented in this header, confirm against the matcher.
    char_classT             cclasses;
    char_classT             cnclasses;
    // NOTE(review): presumably true for a negated set - confirm.
    bool                    isnot;
    // NOTE(review): meaning not documented in this header.
    bool                    singleton;
 };

 /*** struct re_set ****************************************************
 A set of narrow-characters, matches any of _map which is non-zero
 ***********************************************************************/
 // State record for a narrow character set.
 struct re_set : public re_syntax_base
 {
    // 1 << CHAR_BIT entries, i.e. one byte for every value an unsigned char
    // can take; a non-zero entry marks a member of the set.
    unsigned char _map[1 << CHAR_BIT];
 };

 /*** struct re_jump ***************************************************
 Jump to a new location in the machine (not next).
 ***********************************************************************/
 // State record for a jump to a location other than `next`.
 struct re_jump : public re_syntax_base
 {
    // An offset_type like `next`: integral offset during construction,
    // pointer before execution.
    offset_type     alt;                 // location to jump to
 };

 /*** struct re_alt ***************************************************
 Jump to a new location in the machine (possibly next).
 ***********************************************************************/
 // State record for an alternative: extends re_jump with a per-character
 // lookup table saying whether the jump may be taken.
 struct re_alt : public re_jump
 {
    // One byte per possible unsigned char value (1 << CHAR_BIT entries).
    // NOTE(review): presumably filled with mask_type bits (mask_take /
    // mask_skip) - confirm against the code that builds the map.
    unsigned char   _map[1 << CHAR_BIT]; // which characters can take the jump
    // NOTE(review): described as a true/false flag but declared as
    // unsigned int - confirm whether it carries more than a boolean.
    unsigned int    can_be_null;         // true if we match a NULL string
 };

 /*** struct re_repeat *************************************************
 Repeat a section of the machine
 ***********************************************************************/
 // State record for a repeated section of the machine; builds on re_alt.
 struct re_repeat : public re_alt
 {
    // Fewest repeats allowed.
    std::size_t   min;
    // Most repeats allowed.
    std::size_t   max;
    // Identifier that is unique to this repeat.
    int           state_id;
    // Set when this repeat sits at the very start of the machine, which
    // lets some searches be optimized.
    bool          leading;
    // Set when the repeat is greedy.
    bool          greedy;
 };

 /*** struct re_recurse ************************************************
 Recurse to a particular subexpression.
 **********************************************************************/
 // State record for recursion into a particular sub-expression; the
 // inherited `alt` member (from re_jump) is the jump target.
 struct re_recurse : public re_jump
 {
    int state_id;             // identifier of first nested repeat within the recursion.
 };

 /*** enum re_jump_size_type *******************************************
 Provides compiled size of re_jump structure (allowing for trailing alignment).
 We provide this so we know how many bytes to insert when constructing the machine
 (The value of padding_mask is defined in regex_raw_buffer.hpp).
 ***********************************************************************/
 // Padded sizes of the jump-like state records.  Each value is
 // (sizeof(T) + padding_mask) & ~(padding_mask), i.e. sizeof(T) rounded up
 // to the next multiple of padding_mask + 1.
 // NOTE(review): this assumes padding_mask is one less than a power of
 // two; it is defined in regex_raw_buffer.hpp - confirm there.
 enum re_jump_size_type
 {
    re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),         // padded size of re_jump
    re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),   // padded size of re_repeat
    re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)            // padded size of re_alt
 };

 /*** proc re_is_set_member *********************************************
 Forward declaration: we'll need this one later...
 ***********************************************************************/

 // Forward declaration only; regex_data is defined elsewhere.
 template <class charT, class traits>
 struct regex_data;

 // Declaration only; the definition is provided elsewhere.
 // NOTE(review): presumably tests whether the input starting at `next`
 // (and ending before `last`) is a member of the long set `set_`, using
 // the regex data `e` and the case flag `icase` - confirm against the
 // definition, including what the returned iterator denotes.
 template <class iterator, class charT, class traits_type, class char_classT>
 iterator NDNBOOST_REGEX_CALL
 re_is_set_member(iterator next,
                  iterator last,
                  const re_set_long<char_classT>* set_,
                  const regex_data<charT, traits_type>& e,
                  bool icase);

 } // namespace re_detail

 } // namespace ndnboost

 #ifdef NDNBOOST_MSVC
 #pragma warning(push)
 #pragma warning(disable: 4103)
 #endif
 #ifdef NDNBOOST_HAS_ABI_HEADERS
 #  include NDNBOOST_ABI_SUFFIX
 #endif
 #ifdef NDNBOOST_MSVC
 #pragma warning(pop)
 #endif

 #endif
	/*
	*
	* Copyright (c) 1998-2002
	* John Maddock
	*
	* Use, modification and distribution are subject to the
	* Boost Software License, Version 1.0. (See accompanying file
	* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
	*
	*/

	/*
	* LOCATION: see http://www.boost.org for most recent version.
	* FILE states.hpp
	* VERSION see <ndnboost/version.hpp>
	* DESCRIPTION: Declares internal state machine structures.
	*/

	#ifndef NDNBOOST_REGEX_V4_STATES_HPP
	#define NDNBOOST_REGEX_V4_STATES_HPP

	#ifdef NDNBOOST_MSVC
	#pragma warning(push)
	#pragma warning(disable: 4103)
	#endif
	#ifdef NDNBOOST_HAS_ABI_HEADERS
	# include NDNBOOST_ABI_PREFIX
	#endif
	#ifdef NDNBOOST_MSVC
	#pragma warning(pop)
	#endif

	namespace ndnboost{
	namespace re_detail{

	/* mask_type *****************************************************
	Whenever we have a choice of two alternatives, we use an array of bytes
	to indicate which of the two alternatives it is possible to take for any
	given input character. If mask_take is set, then we can take the next
	state, and if mask_skip is set then we can take the alternative.
	***********************************************************************/
	enum mask_type
	{
	   // Bit flags kept in the per-character byte arrays used wherever the
	   // machine has a choice between two alternatives.
	   mask_take = 1,                    // the next state may be taken
	   mask_skip = 2,                    // the alternative may be taken
	   mask_init = 4,                    // NOTE(review): meaning not stated in this header
	   // Bitwise OR of the two flags.  The stray backslash that preceded
	   // the '|' operator was a markup-escape artifact and did not compile.
	   mask_any = mask_skip | mask_take,
	   mask_all = mask_any               // same value as mask_any
	};

	/* helpers ********************************************************
	These helpers let us use function overload resolution to detect whether
	we have narrow or wide character strings:
	***********************************************************************/
	// Empty tag types naming the two character widths; they exist only so
	// that overload resolution can pick between narrow and wide code paths.
	struct _narrow_type{};
	struct _wide_type{};

	// Primary template: any character type not specialised below is wide.
	template <class charT>
	struct is_byte
	{
	   typedef _wide_type width_type;
	};

	// The three single-byte character types are narrow.
	template <>
	struct is_byte<char>
	{
	   typedef _narrow_type width_type;
	};

	template <>
	struct is_byte<unsigned char>
	{
	   typedef _narrow_type width_type;
	};

	template <>
	struct is_byte<signed char>
	{
	   typedef _narrow_type width_type;
	};

	/* enum syntax_element_type ****************************************
	Every record in the state machine falls into one of the following types:
	***********************************************************************/
	enum syntax_element_type
	{
	   // The enumerators are numbered consecutively from zero: only the first
	   // is given a value, every later one is its predecessor plus one.

	   // opening of a marked sub-expression, or of a perl-style (?...) extension
	   syntax_element_startmark = 0,
	   // closing of a marked sub-expression, or of a perl-style (?...) extension
	   syntax_element_endmark,
	   // a run of literal characters
	   syntax_element_literal,
	   // assertion, start of line: ^
	   syntax_element_start_line,
	   // assertion, end of line: $
	   syntax_element_end_line,
	   // wildcard, matches any character: .
	   syntax_element_wild,
	   // end of the expression: reaching this state means we have a match
	   syntax_element_match,
	   // perl-style word boundary: \b
	   syntax_element_word_boundary,
	   // perl-style "within word" (not a boundary): \B
	   syntax_element_within_word,
	   // assertion, start of word: \<
	   syntax_element_word_start,
	   // assertion, end of word: \>
	   syntax_element_word_end,
	   // assertion, start of buffer: \`
	   syntax_element_buffer_start,
	   // assertion, end of buffer: \'
	   syntax_element_buffer_end,
	   // back-reference to a previously matched sub-expression
	   syntax_element_backref,
	   // a wide character set [..], or a set with multicharacter collating elements
	   syntax_element_long_set,
	   // a narrow character set: [...]
	   syntax_element_set,
	   // unconditional jump to another state of the machine
	   syntax_element_jump,
	   // a choice between two production states
	   syntax_element_alt,
	   // a repeat
	   syntax_element_rep,
	   // matches a combining character sequence
	   syntax_element_combining,
	   // perl-style soft buffer end: \z
	   syntax_element_soft_buffer_end,
	   // perl-style continuation: \G
	   syntax_element_restart_continue,
	   // repeats of a single character:
	   syntax_element_dot_rep,
	   syntax_element_char_rep,
	   syntax_element_short_set_rep,
	   syntax_element_long_set_rep,
	   // a backstep, used for lookbehind repeats
	   syntax_element_backstep,
	   // assertion that a given mark was matched
	   syntax_element_assert_backref,
	   // case toggle (undocumented in the original list)
	   syntax_element_toggle_case,
	   // a recursive expression
	   syntax_element_recurse
	};

	#ifdef NDNBOOST_REGEX_DEBUG
	// Declared only when NDNBOOST_REGEX_DEBUG is defined: stream insertion
	// for syntax_element_type.
	// dwa 09/26/00 - needed to suppress warnings about an ambiguous conversion.
	std::ostream& operator<<(std::ostream& os, syntax_element_type element);
	#endif

	// Forward declaration: offset_type below needs to name a pointer to
	// re_syntax_base before that struct is defined.
	struct re_syntax_base;

	/* union offset_type **********************************************
	Points to another state in the machine. During machine construction
	we use integral offsets, but these are converted to pointers before
	execution of the machine. The two members share storage, so only the
	form that was last written may be read.
	***********************************************************************/
	union offset_type
	{
	re_syntax_base* p; // pointer form: used once offsets have been converted
	std::ptrdiff_t i; // integral offset form: used during machine construction
	};

	/* struct re_syntax_base ******************************************
	Base class for all states in the machine.
	***********************************************************************/
	// Common header of every state record; all other re_* state structs in
	// this file derive from it (directly or indirectly).
	struct re_syntax_base
	{
	// Discriminator: one of the syntax_element_type values.
	syntax_element_type type; // what kind of state this is
	// An offset_type, so it holds an integral offset while the machine is
	// being constructed and a pointer before the machine is executed.
	offset_type next; // next state in the machine
	};

	/* struct re_brace ************************************************
	A marked parenthesis.
	***********************************************************************/
	// State record for a marked parenthesis.
	struct re_brace : public re_syntax_base
	{
	// The index to match, can be zero (don't mark the sub-expression)
	// or negative (for perl style (?...) extensions):
	int index;
	// NOTE(review): presumably the case-insensitivity setting that applies
	// to this group - not documented in this header, confirm against the
	// code that reads it.
	bool icase;
	};

	/* struct re_dot ************************************************
	Match anything.
	***********************************************************************/
	enum
	{
	   // NOTE(review): the meaning of these constants is not documented in
	   // this header; the names suggest they relate to how re_dot treats
	   // newlines - confirm against the code that builds and matches re_dot.
	   // First group, listed in increasing value:
	   force_not_newline = 0,
	   dont_care = 1,
	   force_newline = 2,

	   // Second group.  Note that test_not_newline shares the value 2 with
	   // force_newline.
	   test_not_newline = 2,
	   test_newline = 3
	};
	// State record for "match anything".
	struct re_dot : public re_syntax_base
	{
	// NOTE(review): presumably holds one of the dont_care / force_* values
	// from the unnamed enum declared just before this struct - confirm
	// against the code that fills it in.
	unsigned char mask;
	};

	/* struct re_literal **********************************************
	A string of literals, following this structure will be an
	array of characters: charT[length]
	***********************************************************************/
	// Header of a literal-string state.  The characters themselves are not a
	// member: they are stored immediately after this structure as
	// charT[length].
	struct re_literal : public re_syntax_base
	{
	// Number of charT elements that follow this structure.
	unsigned int length;
	};

	/* struct re_case **********************************************
	Indicates whether we are moving to a case insensitive block or not
	***********************************************************************/
	// State record marking a change of case sensitivity.
	struct re_case : public re_syntax_base
	{
	// Whether the block being entered is case insensitive.
	bool icase;
	};

	/* struct re_set_long *********************************************
	A wide character set of characters, following this structure will be
	an array of type charT:
	First csingles null-terminated strings
	Then 2 * cranges NULL terminated strings
	Then cequivalents NULL terminated strings
	***********************************************************************/
	// Header of a wide character set.  The variable-length string data
	// (csingles strings, then 2 * cranges strings, then cequivalents strings,
	// all null terminated) is stored after this structure, not inside it.
	// char_classT is the type used for the two character-class masks.
	template <class char_classT>
	struct re_set_long : public re_syntax_base
	{
	   unsigned int csingles;      // count of single-element strings that follow
	   unsigned int cranges;       // count of ranges; 2 * cranges strings follow
	   unsigned int cequivalents;  // count of equivalent strings that follow
	   // NOTE(review): presumably the character classes that match / must not
	   // match - not documented in this header, confirm against the matcher.
	   char_classT cclasses;
	   char_classT cnclasses;
	   // NOTE(review): presumably true for a negated set - confirm.
	   bool isnot;
	   // NOTE(review): meaning not documented in this header.
	   bool singleton;
	};

	/* struct re_set **************************************************
	A set of narrow-characters, matches any of _map which is non-zero
	***********************************************************************/
	// State record for a narrow character set.
	struct re_set : public re_syntax_base
	{
	// 1 << CHAR_BIT entries, i.e. one byte for every value an unsigned char
	// can take; a non-zero entry marks a member of the set.
	unsigned char _map[1 << CHAR_BIT];
	};

	/* struct re_jump *************************************************
	Jump to a new location in the machine (not next).
	***********************************************************************/
	// State record for a jump to a location other than `next`.
	struct re_jump : public re_syntax_base
	{
	// An offset_type like `next`: integral offset during construction,
	// pointer before execution.
	offset_type alt; // location to jump to
	};

	/* struct re_alt *************************************************
	Jump to a new location in the machine (possibly next).
	***********************************************************************/
	// State record for an alternative: extends re_jump with a per-character
	// lookup table saying whether the jump may be taken.
	struct re_alt : public re_jump
	{
	// One byte per possible unsigned char value (1 << CHAR_BIT entries).
	// NOTE(review): presumably filled with mask_type bits (mask_take /
	// mask_skip) - confirm against the code that builds the map.
	unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump
	// NOTE(review): described as a true/false flag but declared as
	// unsigned int - confirm whether it carries more than a boolean.
	unsigned int can_be_null; // true if we match a NULL string
	};

	/* struct re_repeat ***********************************************
	Repeat a section of the machine
	***********************************************************************/
	// State record for a repeated section of the machine; builds on re_alt.
	struct re_repeat : public re_alt
	{
	   // Fewest repeats allowed.
	   std::size_t min;
	   // Most repeats allowed.
	   std::size_t max;
	   // Identifier that is unique to this repeat.
	   int state_id;
	   // Set when this repeat sits at the very start of the machine, which
	   // lets some searches be optimized.
	   bool leading;
	   // Set when the repeat is greedy.
	   bool greedy;
	};

	/* struct re_recurse **********************************************
	Recurse to a particular subexpression.
	**********************************************************************/
	// State record for recursion into a particular sub-expression; the
	// inherited `alt` member (from re_jump) is the jump target.
	struct re_recurse : public re_jump
	{
	int state_id; // identifier of first nested repeat within the recursion.
	};

	/* enum re_jump_size_type *****************************************
	Provides compiled size of re_jump structure (allowing for trailing alignment).
	We provide this so we know how many bytes to insert when constructing the machine
	(The value of padding_mask is defined in regex_raw_buffer.hpp).
	***********************************************************************/
	// Padded sizes of the jump-like state records.  Each value is
	// (sizeof(T) + padding_mask) & ~(padding_mask), i.e. sizeof(T) rounded up
	// to the next multiple of padding_mask + 1.
	// NOTE(review): this assumes padding_mask is one less than a power of
	// two; it is defined in regex_raw_buffer.hpp - confirm there.
	enum re_jump_size_type
	{
	re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), // padded size of re_jump
	re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), // padded size of re_repeat
	re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) // padded size of re_alt
	};

	/* proc re_is_set_member *******************************************
	Forward declaration: we'll need this one later...
	***********************************************************************/

	// Forward declaration only; regex_data is defined elsewhere.
	template <class charT, class traits>
	struct regex_data;

	// Declaration only; the definition is provided elsewhere.
	// NOTE(review): presumably tests whether the input starting at `next`
	// (and ending before `last`) is a member of the long set `set_`, using
	// the regex data `e` and the case flag `icase` - confirm against the
	// definition, including what the returned iterator denotes.
	template <class iterator, class charT, class traits_type, class char_classT>
	iterator NDNBOOST_REGEX_CALL
	re_is_set_member(iterator next,
	                 iterator last,
	                 const re_set_long<char_classT>* set_,
	                 const regex_data<charT, traits_type>& e,
	                 bool icase);

	} // namespace re_detail

	} // namespace ndnboost

	#ifdef NDNBOOST_MSVC
	#pragma warning(push)
	#pragma warning(disable: 4103)
	#endif
	#ifdef NDNBOOST_HAS_ABI_HEADERS
	# include NDNBOOST_ABI_SUFFIX
	#endif
	#ifdef NDNBOOST_MSVC
	#pragma warning(pop)
	#endif

	#endif