blob: 0698fa0eec49859734e4b73ac84fdc793e851351 [file] [log] [blame]
Jeff Thompson86b6d642013-10-17 15:01:56 -07001/*
2 *
3 * Copyright (c) 1998-2002
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11
12 /*
13 * LOCATION: see http://www.boost.org for most recent version.
14 * FILE states.cpp
15 * VERSION see <ndnboost/version.hpp>
16 * DESCRIPTION: Declares internal state machine structures.
17 */
18
19#ifndef NDNBOOST_REGEX_V4_STATES_HPP
20#define NDNBOOST_REGEX_V4_STATES_HPP
21
22#ifdef NDNBOOST_MSVC
23#pragma warning(push)
24#pragma warning(disable: 4103)
25#endif
26#ifdef NDNBOOST_HAS_ABI_HEADERS
27# include NDNBOOST_ABI_PREFIX
28#endif
29#ifdef NDNBOOST_MSVC
30#pragma warning(pop)
31#endif
32
33namespace ndnboost{
34namespace re_detail{
35
36/*** mask_type *******************************************************
37Whenever we have a choice of two alternatives, we use an array of bytes
38to indicate which of the two alternatives it is possible to take for any
39given input character. If mask_take is set, then we can take the next
40state, and if mask_skip is set then we can take the alternative.
41***********************************************************************/
42enum mask_type
43{
44 mask_take = 1,
45 mask_skip = 2,
46 mask_init = 4,
47 mask_any = mask_skip | mask_take,
48 mask_all = mask_any
49};
50
51/*** helpers **********************************************************
52These helpers let us use function overload resolution to detect whether
53we have narrow or wide character strings:
54***********************************************************************/
55struct _narrow_type{};
56struct _wide_type{};
57template <class charT> struct is_byte;
58template<> struct is_byte<char> { typedef _narrow_type width_type; };
59template<> struct is_byte<unsigned char>{ typedef _narrow_type width_type; };
60template<> struct is_byte<signed char> { typedef _narrow_type width_type; };
61template <class charT> struct is_byte { typedef _wide_type width_type; };
62
63/*** enum syntax_element_type ******************************************
64Every record in the state machine falls into one of the following types:
65***********************************************************************/
66enum syntax_element_type
67{
68 // start of a marked sub-expression, or perl-style (?...) extension
69 syntax_element_startmark = 0,
70 // end of a marked sub-expression, or perl-style (?...) extension
71 syntax_element_endmark = syntax_element_startmark + 1,
72 // any sequence of literal characters
73 syntax_element_literal = syntax_element_endmark + 1,
74 // start of line assertion: ^
75 syntax_element_start_line = syntax_element_literal + 1,
76 // end of line assertion $
77 syntax_element_end_line = syntax_element_start_line + 1,
78 // match any character: .
79 syntax_element_wild = syntax_element_end_line + 1,
80 // end of expression: we have a match when we get here
81 syntax_element_match = syntax_element_wild + 1,
82 // perl style word boundary: \b
83 syntax_element_word_boundary = syntax_element_match + 1,
84 // perl style within word boundary: \B
85 syntax_element_within_word = syntax_element_word_boundary + 1,
86 // start of word assertion: \<
87 syntax_element_word_start = syntax_element_within_word + 1,
88 // end of word assertion: \>
89 syntax_element_word_end = syntax_element_word_start + 1,
90 // start of buffer assertion: \`
91 syntax_element_buffer_start = syntax_element_word_end + 1,
92 // end of buffer assertion: \'
93 syntax_element_buffer_end = syntax_element_buffer_start + 1,
94 // backreference to previously matched sub-expression
95 syntax_element_backref = syntax_element_buffer_end + 1,
96 // either a wide character set [..] or one with multicharacter collating elements:
97 syntax_element_long_set = syntax_element_backref + 1,
98 // narrow character set: [...]
99 syntax_element_set = syntax_element_long_set + 1,
100 // jump to a new state in the machine:
101 syntax_element_jump = syntax_element_set + 1,
102 // choose between two production states:
103 syntax_element_alt = syntax_element_jump + 1,
104 // a repeat
105 syntax_element_rep = syntax_element_alt + 1,
106 // match a combining character sequence
107 syntax_element_combining = syntax_element_rep + 1,
108 // perl style soft buffer end: \z
109 syntax_element_soft_buffer_end = syntax_element_combining + 1,
110 // perl style continuation: \G
111 syntax_element_restart_continue = syntax_element_soft_buffer_end + 1,
112 // single character repeats:
113 syntax_element_dot_rep = syntax_element_restart_continue + 1,
114 syntax_element_char_rep = syntax_element_dot_rep + 1,
115 syntax_element_short_set_rep = syntax_element_char_rep + 1,
116 syntax_element_long_set_rep = syntax_element_short_set_rep + 1,
117 // a backstep for lookbehind repeats:
118 syntax_element_backstep = syntax_element_long_set_rep + 1,
119 // an assertion that a mark was matched:
120 syntax_element_assert_backref = syntax_element_backstep + 1,
121 syntax_element_toggle_case = syntax_element_assert_backref + 1,
122 // a recursive expression:
123 syntax_element_recurse = syntax_element_toggle_case + 1
124};
125
126#ifdef NDNBOOST_REGEX_DEBUG
127// dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
128std::ostream& operator<<(std::ostream&, syntax_element_type);
129#endif
130
131struct re_syntax_base;
132
133/*** union offset_type ************************************************
134Points to another state in the machine. During machine construction
135we use integral offsets, but these are converted to pointers before
136execution of the machine.
137***********************************************************************/
138union offset_type
139{
140 re_syntax_base* p;
141 std::ptrdiff_t i;
142};
143
144/*** struct re_syntax_base ********************************************
145Base class for all states in the machine.
146***********************************************************************/
147struct re_syntax_base
148{
149 syntax_element_type type; // what kind of state this is
150 offset_type next; // next state in the machine
151};
152
153/*** struct re_brace **************************************************
154A marked parenthesis.
155***********************************************************************/
156struct re_brace : public re_syntax_base
157{
158 // The index to match, can be zero (don't mark the sub-expression)
159 // or negative (for perl style (?...) extentions):
160 int index;
161 bool icase;
162};
163
164/*** struct re_dot **************************************************
165Match anything.
166***********************************************************************/
167enum
168{
169 dont_care = 1,
170 force_not_newline = 0,
171 force_newline = 2,
172
173 test_not_newline = 2,
174 test_newline = 3
175};
176struct re_dot : public re_syntax_base
177{
178 unsigned char mask;
179};
180
181/*** struct re_literal ************************************************
182A string of literals, following this structure will be an
183array of characters: charT[length]
184***********************************************************************/
185struct re_literal : public re_syntax_base
186{
187 unsigned int length;
188};
189
190/*** struct re_case ************************************************
191Indicates whether we are moving to a case insensive block or not
192***********************************************************************/
193struct re_case : public re_syntax_base
194{
195 bool icase;
196};
197
198/*** struct re_set_long ***********************************************
199A wide character set of characters, following this structure will be
200an array of type charT:
201First csingles null-terminated strings
202Then 2 * cranges NULL terminated strings
203Then cequivalents NULL terminated strings
204***********************************************************************/
205template <class mask_type>
206struct re_set_long : public re_syntax_base
207{
208 unsigned int csingles, cranges, cequivalents;
209 mask_type cclasses;
210 mask_type cnclasses;
211 bool isnot;
212 bool singleton;
213};
214
215/*** struct re_set ****************************************************
216A set of narrow-characters, matches any of _map which is none-zero
217***********************************************************************/
218struct re_set : public re_syntax_base
219{
220 unsigned char _map[1 << CHAR_BIT];
221};
222
223/*** struct re_jump ***************************************************
224Jump to a new location in the machine (not next).
225***********************************************************************/
226struct re_jump : public re_syntax_base
227{
228 offset_type alt; // location to jump to
229};
230
231/*** struct re_alt ***************************************************
232Jump to a new location in the machine (possibly next).
233***********************************************************************/
234struct re_alt : public re_jump
235{
236 unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump
237 unsigned int can_be_null; // true if we match a NULL string
238};
239
240/*** struct re_repeat *************************************************
241Repeat a section of the machine
242***********************************************************************/
243struct re_repeat : public re_alt
244{
245 std::size_t min, max; // min and max allowable repeats
246 int state_id; // Unique identifier for this repeat
247 bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches)
248 bool greedy; // True if this is a greedy repeat
249};
250
251/*** struct re_recurse ************************************************
252Recurse to a particular subexpression.
253**********************************************************************/
254struct re_recurse : public re_jump
255{
256 int state_id; // identifier of first nested repeat within the recursion.
257};
258
259/*** enum re_jump_size_type *******************************************
260Provides compiled size of re_jump structure (allowing for trailing alignment).
261We provide this so we know how manybytes to insert when constructing the machine
262(The value of padding_mask is defined in regex_raw_buffer.hpp).
263***********************************************************************/
264enum re_jump_size_type
265{
266 re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
267 re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
268 re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
269};
270
271/*** proc re_is_set_member *********************************************
272Forward declaration: we'll need this one later...
273***********************************************************************/
274
275template<class charT, class traits>
276struct regex_data;
277
278template <class iterator, class charT, class traits_type, class char_classT>
279iterator NDNBOOST_REGEX_CALL re_is_set_member(iterator next,
280 iterator last,
281 const re_set_long<char_classT>* set_,
282 const regex_data<charT, traits_type>& e, bool icase);
283
284} // namespace re_detail
285
286} // namespace ndnboost
287
288#ifdef NDNBOOST_MSVC
289#pragma warning(push)
290#pragma warning(disable: 4103)
291#endif
292#ifdef NDNBOOST_HAS_ABI_HEADERS
293# include NDNBOOST_ABI_SUFFIX
294#endif
295#ifdef NDNBOOST_MSVC
296#pragma warning(pop)
297#endif
298
299#endif
300
301