GNU Octave  4.2.1
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
lo-regexp.cc
Go to the documentation of this file.
1 /*
2 
3 Copyright (C) 2012 John W. Eaton
4 Copyright (C) 2005-2016 David Bateman
5 Copyright (C) 2002-2005 Paul Kienzle
6 
7 This file is part of Octave.
8 
9 Octave is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 3 of the License, or (at your
12 option) any later version.
13 
14 Octave is distributed in the hope that it will be useful, but WITHOUT
15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with Octave; see the file COPYING. If not, see
21 <http://www.gnu.org/licenses/>.
22 
23 */
24 
25 #if defined (HAVE_CONFIG_H)
26 # include "config.h"
27 #endif
28 
29 #include <list>
30 #include <sstream>
31 #include <string>
32 #include <vector>
33 
34 #if defined (HAVE_PCRE_H)
35 # include <pcre.h>
36 #elif defined (HAVE_PCRE_PCRE_H)
37 # include <pcre/pcre.h>
38 #endif
39 
40 #include "Matrix.h"
41 #include "base-list.h"
42 #include "lo-error.h"
43 #include "oct-locbuf.h"
44 #include "quit.h"
45 #include "lo-regexp.h"
46 #include "str-vec.h"
47 
48 // Define the maximum number of retries for a pattern
49 // that possibly results in an infinite recursion.
50 #define PCRE_MATCHLIMIT_MAX 10
51 
52 // FIXME: should this be configurable?
53 #define MAXLOOKBEHIND 10
54 
55 static bool lookbehind_warned = false;
56 
57 // FIXME: don't bother collecting and composing return values
58 // the user doesn't want.
59 
60 namespace octave
61 {
62  void
63  regexp::free (void)
64  {
65  if (data)
66  pcre_free (static_cast<pcre *> (data));
67  }
68 
69  void
71  {
72  // If we had a previously compiled pattern, release it.
73  free ();
74 
75  size_t max_length = MAXLOOKBEHIND;
76 
77  size_t pos = 0;
78  size_t new_pos;
79  int inames = 0;
80  std::ostringstream buf;
81 
82  while ((new_pos = pattern.find ("(?", pos)) != std::string::npos)
83  {
84  if (pattern.at (new_pos + 2) == '<'
85  && !(pattern.at (new_pos + 3) == '='
86  || pattern.at (new_pos + 3) == '!'))
87  {
88  // The syntax of named tokens in pcre is "(?P<name>...)" while
89  // we need a syntax "(?<name>...)", so fix that here. Also an
90  // expression like
91  // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)"
92  // should be perfectly legal, while pcre does not allow the same
93  // named token name on both sides of the alternative. Also fix
94  // that here by replacing name tokens by dummy names, and dealing
95  // with the dummy names later.
96 
97  size_t tmp_pos = pattern.find_first_of ('>', new_pos);
98 
99  if (tmp_pos == std::string::npos)
100  (*current_liboctave_error_handler)
101  ("regexp: syntax error in pattern");
102 
103  std::string tmp_name =
104  pattern.substr (new_pos+3, tmp_pos-new_pos-3);
105 
106  bool found = false;
107 
108  for (int i = 0; i < nnames; i++)
109  {
110  if (named_pats(i) == tmp_name)
111  {
112  named_idx.resize (dim_vector (inames+1, 1));
113  named_idx(inames) = i;
114  found = true;
115  break;
116  }
117  }
118 
119  if (! found)
120  {
121  named_idx.resize (dim_vector (inames+1, 1));
122  named_idx(inames) = nnames;
123  named_pats.append (tmp_name);
124  nnames++;
125  }
126 
127  if (new_pos - pos > 0)
128  buf << pattern.substr (pos, new_pos-pos);
129  if (inames < 10)
130  buf << "(?P<n00" << inames++;
131  else if (inames < 100)
132  buf << "(?P<n0" << inames++;
133  else
134  buf << "(?P<n" << inames++;
135 
136  pos = tmp_pos;
137  }
138  else if (pattern.at (new_pos + 2) == '<')
139  {
140  // Find lookbehind operators of arbitrary length (ie like
141  // "(?<=[a-z]*)") and replace with a maximum length operator
142  // as PCRE can not yet handle arbitrary length lookahead
143  // operators. Use the string length as the maximum length to
144  // avoid issues.
145 
146  int brackets = 1;
147  size_t tmp_pos1 = new_pos + 2;
148  size_t tmp_pos2 = tmp_pos1;
149 
150  while (tmp_pos1 < pattern.length () && brackets > 0)
151  {
152  char ch = pattern.at (tmp_pos1);
153 
154  if (ch == '(')
155  brackets++;
156  else if (ch == ')')
157  {
158  if (brackets > 1)
159  tmp_pos2 = tmp_pos1;
160 
161  brackets--;
162  }
163 
164  tmp_pos1++;
165  }
166 
167  if (brackets != 0)
168  {
169  buf << pattern.substr (pos, new_pos - pos) << "(?";
170  pos = new_pos + 2;
171  }
172  else
173  {
174  size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2);
175 
176  if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1)
177  {
178  if (! lookbehind_warned)
179  {
180  lookbehind_warned = true;
181  (*current_liboctave_warning_with_id_handler)
182  ("Octave:regexp-lookbehind-limit",
183  "%s: arbitrary length lookbehind patterns are only supported up to length %d",
184  who.c_str (), MAXLOOKBEHIND);
185  }
186 
187  buf << pattern.substr (pos, new_pos - pos) << "(";
188 
189  size_t i;
190 
191  if (pattern.at (tmp_pos3) == '*')
192  i = 0;
193  else
194  i = 1;
195 
196  for (; i < max_length + 1; i++)
197  {
198  buf << pattern.substr (new_pos, tmp_pos3 - new_pos)
199  << "{" << i << "}";
200  buf << pattern.substr (tmp_pos3 + 1,
201  tmp_pos1 - tmp_pos3 - 1);
202  if (i != max_length)
203  buf << "|";
204  }
205  buf << ")";
206  }
207  else
208  buf << pattern.substr (pos, tmp_pos1 - pos);
209 
210  pos = tmp_pos1;
211  }
212  }
213  else
214  {
215  buf << pattern.substr (pos, new_pos - pos) << "(?";
216  pos = new_pos + 2;
217  }
218 
219  }
220 
221  buf << pattern.substr (pos);
222 
223  const char *err;
224  int erroffset;
225  std::string buf_str = buf.str ();
226 
227  int pcre_options
228  = ((options.case_insensitive () ? PCRE_CASELESS : 0)
229  | (options.dotexceptnewline () ? 0 : PCRE_DOTALL)
230  | (options.lineanchors () ? PCRE_MULTILINE : 0)
231  | (options.freespacing () ? PCRE_EXTENDED : 0));
232 
233  data = pcre_compile (buf_str.c_str (), pcre_options, &err, &erroffset, 0);
234 
235  if (! data)
236  (*current_liboctave_error_handler)
237  ("%s: %s at position %d of expression", who.c_str (), err, erroffset);
238  }
239 
241  regexp::match (const std::string& buffer)
242  {
244 
245  std::list<regexp::match_element> lst;
246 
247  int subpatterns;
248  int namecount;
249  int nameentrysize;
250  char *nametable;
251  size_t idx = 0;
252 
253  pcre *re = static_cast<pcre *> (data);
254 
255  pcre_fullinfo (re, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns);
256  pcre_fullinfo (re, 0, PCRE_INFO_NAMECOUNT, &namecount);
257  pcre_fullinfo (re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
258  pcre_fullinfo (re, 0, PCRE_INFO_NAMETABLE, &nametable);
259 
260  OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3);
261  OCTAVE_LOCAL_BUFFER (int, nidx, namecount);
262 
263  for (int i = 0; i < namecount; i++)
264  {
265  // Index of subpattern in first two bytes MSB first of name.
266  // Extract index.
267  nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8
268  | static_cast<int> (nametable[i*nameentrysize+1]);
269  }
270 
271  while (true)
272  {
273  OCTAVE_QUIT;
274 
275  int matches = pcre_exec (re, 0, buffer.c_str (),
276  buffer.length (), idx,
277  (idx ? PCRE_NOTBOL : 0),
278  ovector, (subpatterns+1)*3);
279 
280  if (matches == PCRE_ERROR_MATCHLIMIT)
281  {
282  // Try harder; start with default value for MATCH_LIMIT
283  // and increase it.
284  (*current_liboctave_warning_with_id_handler)
285  ("Octave:regexp-match-limit",
286  "your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow");
287 
288  pcre_extra pe;
289 
290  pcre_config (PCRE_CONFIG_MATCH_LIMIT,
291  static_cast<void *> (&pe.match_limit));
292 
293  pe.flags = PCRE_EXTRA_MATCH_LIMIT;
294 
295  int i = 0;
296  while (matches == PCRE_ERROR_MATCHLIMIT
297  && i++ < PCRE_MATCHLIMIT_MAX)
298  {
299  OCTAVE_QUIT;
300 
301  pe.match_limit *= 10;
302  matches = pcre_exec (re, &pe, buffer.c_str (),
303  buffer.length (), idx,
304  (idx ? PCRE_NOTBOL : 0),
305  ovector, (subpatterns+1)*3);
306  }
307  }
308 
309  if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
310  (*current_liboctave_error_handler)
311  ("%s: internal error calling pcre_exec; "
312  "error code from pcre_exec is %i", who.c_str (), matches);
313 
314  if (matches == PCRE_ERROR_NOMATCH)
315  break;
316  else if (ovector[1] <= ovector[0] && ! options.emptymatch ())
317  {
318  // Zero length match. Skip to next char.
319  idx = ovector[0] + 1;
320  if (idx < buffer.length ())
321  continue;
322  else
323  break;
324  }
325  else
326  {
327  int pos_match = 0;
328  Matrix token_extents (matches-1, 2);
329 
330  for (int i = 1; i < matches; i++)
331  {
332  if (ovector[2*i] >= 0 && ovector[2*i+1] > 0
333  && (i == 1 || ovector[2*i] != ovector[2*i-2]
334  || ovector[2*i-1] != ovector[2*i+1]))
335  {
336  token_extents(pos_match,0) = double (ovector[2*i]+1);
337  token_extents(pos_match++,1) = double (ovector[2*i+1]);
338  }
339  }
340 
341  token_extents.resize (pos_match, 2);
342 
343  double start = double (ovector[0]+1);
344  double end = double (ovector[1]);
345 
346  const char **listptr;
347  int status = pcre_get_substring_list (buffer.c_str (), ovector,
348  matches, &listptr);
349 
350  if (status == PCRE_ERROR_NOMEMORY)
351  (*current_liboctave_error_handler)
352  ("%s: cannot allocate memory in pcre_get_substring_list",
353  who.c_str ());
354 
355  string_vector tokens (pos_match);
356  string_vector named_tokens (nnames);
357  int pos_offset = 0;
358  pos_match = 0;
359 
360  for (int i = 1; i < matches; i++)
361  {
362  if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
363  {
364  if (i == 1 || ovector[2*i] != ovector[2*i-2]
365  || ovector[2*i-1] != ovector[2*i+1])
366  {
367  if (namecount > 0)
368  {
369  // FIXME: Should probably do this with a map()
370  // rather than a linear search. However,
371  // the number of captured, named expressions
372  // is usually pretty small (< 4)
373  for (int j = 0; j < namecount; j++)
374  {
375  if (nidx[j] == i)
376  {
377  named_tokens(named_idx(j)) =
378  std::string (*(listptr+i-pos_offset));
379  break;
380  }
381  }
382  }
383 
384  tokens(pos_match++) = std::string (*(listptr+i));
385  }
386  else
387  pos_offset++;
388  }
389  }
390 
391  std::string match_string = std::string (*listptr);
392 
393  pcre_free_substring_list (listptr);
394 
395  regexp::match_element new_elem (named_tokens, tokens, match_string,
396  token_extents, start, end);
397  lst.push_back (new_elem);
398 
399  if (ovector[1] <= ovector[0])
400  {
401  // Zero length match. Skip to next char.
402  idx = ovector[0] + 1;
403  if (idx <= buffer.length ())
404  continue;
405  }
406  else
407  idx = ovector[1];
408 
409  if (options.once () || idx >= buffer.length ())
410  break;
411  }
412  }
413 
414  retval = regexp::match_data (lst, named_pats);
415 
416  return retval;
417  }
418 
419  bool
421  {
422  regexp::match_data rx_lst = match (buffer);
423 
424  return rx_lst.size () > 0;
425  }
426 
429  {
430  octave_idx_type len = buffer.numel ();
431 
432  Array<bool> retval (dim_vector (len, 1));
433 
434  for (octave_idx_type i = 0; i < buffer.numel (); i++)
435  retval(i) = is_match (buffer(i));
436 
437  return retval;
438  }
439 
440  // Declare rep_token_t used in processing replacement string
441  typedef struct
442  {
443  size_t pos;
444  int num;
445  } rep_token_t;
446 
448  regexp::replace (const std::string& buffer, const std::string& replacement)
449  {
451 
452  regexp::match_data rx_lst = match (buffer);
453 
454  size_t num_matches = rx_lst.size ();
455 
456  if (num_matches == 0)
457  {
458  retval = buffer;
459  return retval;
460  }
461 
462  // Identify replacement tokens; build a vector of group numbers in
463  // the replacement string so that we can quickly calculate the size
464  // of the replacement.
465 
466  // FIXME: All code assumes that only 10 tokens ($0-$9) exist.
467  // $11 represents $1 followed by the character '1' rather than
468  // the eleventh capture buffer.
469 
470  std::string repstr = replacement;
471  std::vector<rep_token_t> tokens;
472  tokens.reserve (5); // Reserve memory for 5 pattern replacements
473 
474  for (size_t i=0; i < repstr.size (); i++)
475  {
476  if (repstr[i] == '\\')
477  {
478  if (i < repstr.size () - 1 && repstr[i+1] == '$')
479  {
480  repstr.erase (i,1); // erase backslash
481  i++; // skip over '$'
482  continue;
483  }
484  if (i < repstr.size () - 1 && repstr[i+1] == '\\')
485  {
486  repstr.erase (i,1); // erase 1st backslash
487  continue;
488  }
489  }
490  else if (repstr[i] == '$')
491  {
492  if (i < repstr.size () - 1 && isdigit (repstr[i+1]))
493  {
494  rep_token_t tmp_token;
495 
496  tmp_token.pos = i;
497  tmp_token.num = repstr[i+1]-'0';
498  tokens.push_back (tmp_token);
499  }
500  }
501  }
502 
503  std::string rep;
504  int num_tokens = tokens.size ();
505 
506  if (num_tokens > 0)
507  {
508  // Determine replacement length
509  const size_t replen = repstr.size () - 2*num_tokens;
510  int delta = 0;
512  for (size_t i = 0; i < num_matches; i++)
513  {
514  OCTAVE_QUIT;
515 
516  double start = p->start ();
517  double end = p->end ();
518 
519  const Matrix pairs (p->token_extents ());
520  size_t pairlen = 0;
521  for (int j = 0; j < num_tokens; j++)
522  {
523  if (tokens[j].num == 0)
524  pairlen += static_cast<size_t> (end - start) + 1;
525  else if (tokens[j].num <= pairs.rows ())
526  pairlen += static_cast<size_t> (pairs(tokens[j].num-1,1)
527  - pairs(tokens[j].num-1,0)) + 1;
528  }
529  delta += (static_cast<int> (replen + pairlen)
530  - static_cast<int> (end - start + 1));
531  p++;
532  }
533 
534  // Build replacement string
535  rep.reserve (buffer.size () + delta);
536  size_t from = 0;
537  p = rx_lst.begin ();
538  for (size_t i = 0; i < num_matches; i++)
539  {
540  OCTAVE_QUIT;
541 
542  double start = p->start ();
543  double end = p->end ();
544 
545  const Matrix pairs (p->token_extents ());
546  rep.append (&buffer[from], static_cast<size_t> (start - 1) - from);
547  from = static_cast<size_t> (end);
548 
549  size_t cur_pos = 0;
550 
551  for (int j = 0; j < num_tokens; j++)
552  {
553  rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos);
554  cur_pos = tokens[j].pos+2;
555 
556  int k = tokens[j].num;
557  if (k == 0)
558  {
559  // replace with entire match
560  rep.append (&buffer[static_cast<size_t> (end - 1)],
561  static_cast<size_t> (end - start) + 1);
562  }
563  else if (k <= pairs.rows ())
564  {
565  // replace with group capture
566  rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
567  static_cast<size_t> (pairs(k-1,1)
568  - pairs(k-1,0)) + 1);
569  }
570  else
571  {
572  // replace with nothing
573  }
574  }
575  if (cur_pos < repstr.size ())
576  rep.append (&repstr[cur_pos], repstr.size () - cur_pos);
577 
578  p++;
579  }
580  rep.append (&buffer[from], buffer.size () - from);
581  }
582  else
583  {
584  // Determine repstr length
585  const size_t replen = repstr.size ();
586  int delta = 0;
588  for (size_t i = 0; i < num_matches; i++)
589  {
590  OCTAVE_QUIT;
591  delta += static_cast<int> (replen)
592  - static_cast<int> (p->end () - p->start () + 1);
593  p++;
594  }
595 
596  // Build replacement string
597  rep.reserve (buffer.size () + delta);
598  size_t from = 0;
599  p = rx_lst.begin ();
600  for (size_t i = 0; i < num_matches; i++)
601  {
602  OCTAVE_QUIT;
603  rep.append (&buffer[from],
604  static_cast<size_t> (p->start () - 1) - from);
605  from = static_cast<size_t> (p->end ());
606  rep.append (repstr);
607  p++;
608  }
609  rep.append (&buffer[from], buffer.size () - from);
610  }
611 
612  retval = rep;
613  return retval;
614  }
615 }
std::string who
Definition: lo-regexp.h:282
Octave interface to the compression and uncompression libraries.
Definition: aepbalance.cc:47
void resize(octave_idx_type nr, octave_idx_type nc, double rfv=0)
Definition: dMatrix.h:145
octave_idx_type numel(void) const
Number of elements in the array.
Definition: Array.h:363
for large enough k
Definition: lu.cc:606
OCTAVE_EXPORT octave_value_list search each directory of the loadpath for element of the cell array and return the first that matches If the second optional argument return a cell array containing the list of all files that have the same name in the path If no files are found
Definition: utils.cc:302
void dotexceptnewline(bool val)
Definition: lo-regexp.h:173
string_vector & append(const std::string &s)
Definition: str-vec.cc:107
void resize(const dim_vector &dv, const T &rfv)
Definition: Array.cc:1028
octave_value retval
Definition: data.cc:6294
std::string replace(const std::string &buffer, const std::string &replacement)
Definition: lo-regexp.cc:448
Definition: dMatrix.h:37
bool is_match(const std::string &buffer)
Definition: lo-regexp.cc:420
Array< int > named_idx
Definition: lo-regexp.h:281
#define MAXLOOKBEHIND
Definition: lo-regexp.cc:53
match_data match(const std::string &buffer)
Definition: lo-regexp.cc:241
std::list< match_element >::const_iterator const_iterator
Definition: base-list.h:41
octave::sys::time start
Definition: graphics.cc:11731
void emptymatch(bool val)
Definition: lo-regexp.h:174
=val(i)}if ode{val(i)}occurs in table i
Definition: lookup.cc:239
size_t size(void) const
Definition: base-list.h:49
p
Definition: lu.cc:138
void compile_internal(void)
Definition: lo-regexp.cc:70
issues an error eealso double
Definition: ov-bool-mat.cc:594
string_vector named_pats
Definition: lo-regexp.h:279
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:200
void free(void)
Definition: lo-regexp.cc:63
#define PCRE_MATCHLIMIT_MAX
Definition: lo-regexp.cc:50
OCTAVE_EXPORT octave_value_list error nd deftypefn *const octave_scalar_map err
Definition: error.cc:1036
void case_insensitive(bool val)
Definition: lo-regexp.h:172
void lineanchors(bool val)
Definition: lo-regexp.h:176
#define OCTAVE_QUIT
Definition: quit.h:212
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:87
std::string pattern
Definition: lo-regexp.h:271
If this string is the system will ring the terminal sometimes it is useful to be able to print the original representation of the string
Definition: utils.cc:854
static bool lookbehind_warned
Definition: lo-regexp.cc:55
iterator begin(void)
Definition: base-list.h:83
Matrix append(const Matrix &a) const
Definition: dMatrix.cc:266
void once(bool val)
Definition: lo-regexp.h:177
void freespacing(bool val)
Definition: lo-regexp.h:175