wibble  1.1
parse.h
Go to the documentation of this file.
1 // -*- C++ -*- (c) 2011 Petr Rockai
2 // (c) 2013 Jan Kriho
3 // Support code for writing backtracking recursive-descent parsers
4 #include <string>
5 #include <cassert>
6 #include <deque>
7 #include <vector>
8 #include <map>
9 #include <queue>
10 
11 #include <wibble/regexp.h>
12 
13 #ifndef WIBBLE_PARSE_H
14 #define WIBBLE_PARSE_H
15 
16 namespace wibble {
17 
18 struct Position {
19  std::string source;
20  int line;
21  int column;
22  Position() : source("-"), line(1), column(1) {}
23  bool operator==( const Position &o ) const {
24  return o.source == source && o.line == line && o.column == column;
25  }
26 };
27 
28 template< typename _Id >
29 struct Token {
30  typedef _Id Id;
31  Id id;
32  std::string data;
34  bool _valid;
35 
36  bool valid() { return _valid; }
37 
38  Token( Id _id, char c ) : id( _id ), _valid( true ) {
39  data.push_back( c );
40  }
41 
42  Token( Id _id, std::string d ) : id( _id ), data( d ), _valid( true ) {}
43  Token() : id( Id( 0 ) ), _valid( false ) {}
44 
45  bool operator==( const Token &o ) const {
46  return o._valid == _valid && o.id == id && o.data == data && o.position == position;
47  }
48 
49 };
50 
51 template< typename X, typename Y >
52 inline std::ostream &operator<<( std::ostream &o, const std::pair< X, Y > &x ) {
53  return o << "(" << x.first << ", " << x.second << ")";
54 }
55 
56 /*
57  * This is a SLOW lexer (at least compared to systems like lex/flex, which
58  * build a finite-state machine to make decisions per input character. We could
59  * do that in theory, but it would still be slow, since we would be in effect
60  * interpreting the FSM anyway, while (f)lex uses an optimising compiler like
61  * gcc. So while this is friendly and flexible, it won't give you a fast
62  * scanner.
63  */
64 template< typename Token, typename Stream >
65 struct Lexer {
66  Stream &stream;
67  typedef std::deque< char > Window;
70 
72 
73  void shift() {
74  assert( !stream.eof() );
75  std::string r = stream.remove();
76  std::copy( r.begin(), r.end(), std::back_inserter( _window ) );
77  }
78 
79  bool eof() {
80  return _window.empty() && stream.eof();
81  }
82 
83  std::string window( unsigned n ) {
84  bool valid = ensure_window( n );
85  assert( valid );
86  static_cast< void >( valid );
87  std::deque< char >::iterator b = _window.begin(), e = b;
88  e += n;
89  return std::string( b, e );
90  }
91 
92  bool ensure_window( unsigned n ) {
93  while ( _window.size() < n && !stream.eof() )
94  shift();
95  return _window.size() >= n;
96  }
97 
98  void consume( int n ) {
99  for( int i = 0; i < n; ++i ) {
100  if ( _window[i] == '\n' ) {
101  current.line ++;
102  current.column = 1;
103  } else
104  current.column ++;
105  }
106  std::deque< char >::iterator b = _window.begin(), e = b;
107  e += n;
108  _window.erase( b, e );
109  }
110 
111  void consume( const std::string &s ) {
112  consume( s.length() );
113  }
114 
115  void consume( const Token &t ) {
116  // std::cerr << "consuming " << t << std::endl;
117  consume( t.data );
118  }
119 
120  void keep( typename Token::Id id, const std::string &data ) {
121  Token t( id, data );
122  t.position = current;
123  if ( t.data.length() > _match.data.length() )
124  _match = t;
125  }
126 
127  template< typename I >
128  bool match( I begin, I end ) {
129  if ( !ensure_window( end - begin ) )
130  return false;
131  return std::equal( begin, end, _window.begin() );
132  }
133 
134  void match( const std::string &data, typename Token::Id id ) {
135  if ( match( data.begin(), data.end() ) )
136  return keep( id, data );
137  }
138 
139  void match( Regexp &r, typename Token::Id id ) {
140  unsigned n = 1, max = 0;
141  while ( r.match( window( n ) ) ) {
142  if ( max && max == r.matchLength( 0 ) )
143  return keep( id, window( max ) );
144  max = r.matchLength( 0 );
145  ++ n;
146  }
147  }
148 
149  void match( int (*first)(int), int (*rest)(int), typename Token::Id id )
150  {
151  unsigned n = 1;
152 
153  if ( !ensure_window( 1 ) )
154  return;
155 
156  if ( !first( _window[0] ) )
157  return;
158 
159  while ( true ) {
160  ++ n;
161  if ( ensure_window( n ) && rest( _window[ n - 1 ] ) )
162  continue;
163  return keep( id, window( n - 1 ) );
164  }
165  }
166 
167  void match( const std::string &from, const std::string &to, typename Token::Id id ) {
168  if ( !match( from.begin(), from.end() ) )
169  return;
170 
171  Window::iterator where = _window.begin();
172  int n = from.length();
173  where += n;
174  while ( true ) {
175  if ( !ensure_window( n + to.length() ) )
176  return;
177 
178  if ( std::equal( to.begin(), to.end(), where ) )
179  return keep( id, window( n + to.length() ) );
180  ++ where;
181  ++ n;
182  }
183  }
184 
185  void skipWhitespace() {
186  while ( !eof() && isspace( window( 1 )[ 0 ] ) )
187  consume( 1 );
188  }
189 
191  Token t;
192  std::swap( t, _match );
193  consume( t );
194  return t;
195  }
196 
197  Lexer( Stream &s ) : stream( s ) {}
198 };
199 
200 template< typename Token, typename Stream >
202 {
203  Stream *_stream;
204  Stream &stream() { assert( _stream ); return *_stream; }
205 
206  std::deque< Token > window;
208  int position;
209  std::string name;
211  std::vector< This > children;
212 
213  struct Fail {
214  enum Type { Syntax, Semantic };
215 
216  int position;
217  const char *expected;
219 
220  bool operator<( const Fail &other ) const {
221  return position > other.position;
222  }
223 
224  Fail( const char *err, int pos, Type t = Syntax) {
225  expected = err;
226  position = pos;
227  type = t;
228  }
229  ~Fail() throw () {}
230  };
231 
232  std::priority_queue< Fail > failures;
233 
234  void clearErrors() {
235  failures = std::priority_queue< Fail >();
236  }
237 
238  void error( std::ostream &o, std::string prefix, const Fail &fail ) {
239  Token t = window[ fail.position ];
240  switch ( fail.type ) {
241  case Fail::Syntax:
242  o << prefix
243  << "expected " << fail.expected
244  << " at line " << t.position.line
245  << ", column " << t.position.column
246  << ", but seen " << Token::tokenName[ t.id ] << " '" << t.data << "'"
247  << std::endl;
248  return;
249  case Fail::Semantic:
250  o << prefix
251  << fail.expected
252  << " at line " << t.position.line
253  << ", column " << t.position.column
254  << std::endl;
255  return;
256  }
257  }
258 
259  void errors( std::ostream &o ) {
260  for ( typename std::vector< This >::iterator pc = children.begin(); pc != children.end(); ++pc )
261  pc->errors( o );
262 
263  if ( failures.empty() )
264  return;
265 
266  std::string prefix;
267  switch ( failures.top().type ) {
268  case Fail::Syntax:
269  o << "parse";
270  break;
271  case Fail::Semantic:
272  o << "semantic";
273  break;
274  }
275  o << " error in context " << name << ": ";
276  if ( failures.size() > 1 ) {
277  o << failures.size() << " rightmost alternatives:" << std::endl;
278  prefix = " ";
279  }
280  while ( !failures.empty() ) {
281  error( o, prefix, failures.top() );
282  failures.pop();
283  }
284  }
285 
286  Token remove() {
287  if ( int( window.size() ) <= window_pos ) {
288  Token t;
289  do {
290  t = stream().remove();
291  } while ( t.id == Token::Comment ); // XXX
292  window.push_back( t );
293  }
294 
295  ++ window_pos;
296  ++ position;
297  return window[ window_pos - 1 ];
298  }
299 
300  void rewind( int n ) {
301  assert( n >= 0 );
302  assert( n <= window_pos );
303  window_pos -= n;
304  position -= n;
305  }
306 
307  This & createChild( Stream &s, std::string name ) {
308  children.push_back( This( s, name ) );
309  return children.back();
310  }
311 
312  ParseContext( Stream &s, std::string name )
313  : _stream( &s ), window_pos( 0 ), position( 0 ), name( name )
314  {}
315 };
316 
317 template< typename Token, typename Stream >
318 struct Parser {
319  typedef typename Token::Id TokenId;
322  typedef typename Context::Fail Fail;
323  typedef typename Fail::Type FailType;
325 
326  bool valid() const {
327  return ctx;
328  }
329 
331  assert( ctx );
332  return *ctx;
333  }
334 
335  int position() {
336  return context().position;
337  }
338 
339  void rewind( int i ) {
340  context().rewind( i );
342  }
343 
344  void fail( const char *what, FailType type = FailType::Syntax ) __attribute__((noreturn))
345  {
346  Fail f( what, _position, type );
347  context().failures.push( f );
348  while ( context().failures.top().position < _position )
349  context().failures.pop();
350  throw f;
351  }
352 
353  void semicolon() {
354  Token t = eat();
355  if ( t.id == Token::Punctuation && t.data == ";" )
356  return;
357 
358  rewind( 1 );
359  fail( "semicolon" );
360  }
361 
362  void colon() {
363  Token t = eat();
364  if ( t.id == Token::Punctuation && t.data == ":" )
365  return;
366 
367  rewind( 1 );
368  fail( "colon" );
369  }
370 
371  Token eat( TokenId id ) {
372  Token t = eat( false );
373  if ( t.id == id )
374  return t;
375  rewind( 1 );
376  fail( Token::tokenName[id].c_str() );
377  }
378 
379 
380 #if __cplusplus >= 201103L
381  template< typename F >
382  void either( void (F::*f)() ) {
383  (static_cast< F* >( this )->*f)();
384  }
385 
386  template< typename F, typename... Args >
387  void either( F f, Args... args ) {
388  if ( maybe( f ) )
389  return;
390  either( args... );
391  }
392 #else
393  template< typename F, typename G >
394  void either( F f, void (G::*g)() ) {
395  if ( maybe( f ) )
396  return;
397  (static_cast< G* >( this )->*g)();
398  }
399 #endif
400 
401 
402 #if __cplusplus >= 201103L
403  template< typename F, typename... Args >
404  bool maybe( F f, Args... args ) {
405  if ( maybe( f ) )
406  return true;
407  return maybe( args... );
408  }
409 
410 #else
411  template< typename F, typename G >
412  bool maybe( F f, G g ) {
413  if ( maybe( f ) )
414  return true;
415  return maybe( g );
416  }
417 
418 
419  template< typename F, typename G, typename H >
420  bool maybe( F f, G g, H h ) {
421  if ( maybe( f ) )
422  return true;
423  if ( maybe( g ) )
424  return true;
425  return maybe( h );
426  }
427 
428 #endif
429 
430  template< typename F >
431  bool maybe( void (F::*f)() ) {
432  int fallback = position();
433  try {
434  (static_cast< F* >( this )->*f)();
435  return true;
436  } catch ( Fail fail ) {
437  rewind( position() - fallback );
438  return false;
439  }
440  }
441 
442  bool maybe( TokenId id ) {
443  int fallback = position();
444  try {
445  eat( id );
446  return true;
447  } catch (Fail) {
448  rewind( position() - fallback );
449  return false;
450  }
451  }
452 
453  template< typename T, typename I >
454  void many( I i ) {
455  int fallback = 0;
456  try {
457  while ( true ) {
458  fallback = position();
459  *i++ = T( context() );
460  }
461  } catch (Fail) {
462  rewind( position() - fallback );
463  }
464  }
465 #if __cplusplus >= 201103L
466  template< typename F >
467  bool arbitrary( F f ) {
468  return maybe( f );
469  }
470 
471  template< typename F, typename... Args >
472  bool arbitrary( F f, Args... args ) {
473  bool retval = arbitrary( args... );
474  retval |= maybe( f );
475  retval |= arbitrary( args... );
476  return retval;
477  }
478 #endif
479 
480  template< typename T, typename I >
481  void list( I i, TokenId sep ) {
482  do {
483  *i++ = T( context() );
484  } while ( next( sep ) );
485  }
486 
487  template< typename T, typename I, typename F >
488  void list( I i, void (F::*sep)() ) {
489  int fallback = position();
490  try {
491  while ( true ) {
492  *i++ = T( context() );
493  fallback = position();
494  (static_cast< F* >( this )->*sep)();
495  }
496  } catch(Fail) {
497  rewind( position() - fallback );
498  }
499  }
500 
501  template< typename T, typename I >
502  void list( I i, TokenId first, TokenId sep, TokenId last ) {
503  eat( first );
504  list< T >( i, sep );
505  eat( last );
506  }
507 
508  Token eat( bool _fail = true ) {
509  Token t = context().remove();
511  if ( _fail && !t.valid() ) {
512  rewind( 1 );
513  fail( "valid token" );
514  }
515  return t;
516  }
517 
518  bool next( TokenId id ) {
519  Token t = eat( false );
520  if ( t.id == id )
521  return true;
522  rewind( 1 );
523  return false;
524  }
525 
526  Parser( Context &c ) : ctx( &c ) {}
527  Parser() : ctx( 0 ) {}
528 
529 };
530 
531 }
532 
533 #endif