GNU Octave  4.4.1
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
lo-regexp.cc
Go to the documentation of this file.
1 /*
2 
3 Copyright (C) 2005-2018 David Bateman
4 Copyright (C) 2002-2005 Paul Kienzle
5 Copyright (C) 2012 John W. Eaton
6 
7 This file is part of Octave.
8 
9 Octave is free software: you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 Octave is distributed in the hope that it will be useful, but
15 WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with Octave; see the file COPYING. If not, see
21 <https://www.gnu.org/licenses/>.
22 
23 */
24 
25 #if defined (HAVE_CONFIG_H)
26 # include "config.h"
27 #endif
28 
29 #include <list>
30 #include <sstream>
31 #include <string>
32 #include <vector>
33 
34 #if defined (HAVE_PCRE_H)
35 # include <pcre.h>
36 #elif defined (HAVE_PCRE_PCRE_H)
37 # include <pcre/pcre.h>
38 #endif
39 
40 #include "Matrix.h"
41 #include "base-list.h"
42 #include "lo-error.h"
43 #include "oct-locbuf.h"
44 #include "quit.h"
45 #include "lo-regexp.h"
46 #include "str-vec.h"
47 
48 // Define the maximum number of retries for a pattern
49 // that possibly results in an infinite recursion.
50 #define PCRE_MATCHLIMIT_MAX 10
51 
52 // FIXME: should this be configurable?
53 #define MAXLOOKBEHIND 10
54 
55 static bool lookbehind_warned = false;
56 
57 // FIXME: don't bother collecting and composing return values
58 // the user doesn't want.
59 
60 namespace octave
61 {
62  void
63  regexp::free (void)
64  {
65  if (data)
66  pcre_free (static_cast<pcre *> (data));
67  }
68 
69  void
71  {
72  // If we had a previously compiled pattern, release it.
73  free ();
74 
75  size_t max_length = MAXLOOKBEHIND;
76 
77  size_t pos = 0;
78  size_t new_pos;
79  int inames = 0;
80  std::ostringstream buf;
81 
82  while ((new_pos = pattern.find ("(?", pos)) != std::string::npos)
83  {
84  if (pattern.at (new_pos + 2) == '<'
85  && !(pattern.at (new_pos + 3) == '='
86  || pattern.at (new_pos + 3) == '!'))
87  {
88  // The syntax of named tokens in pcre is "(?P<name>...)" while
89  // we need a syntax "(?<name>...)", so fix that here. Also an
90  // expression like
91  // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)"
92  // should be perfectly legal, while pcre does not allow the same
93  // named token name on both sides of the alternative. Also fix
94  // that here by replacing name tokens by dummy names, and dealing
95  // with the dummy names later.
96 
97  size_t tmp_pos = pattern.find_first_of ('>', new_pos);
98 
99  if (tmp_pos == std::string::npos)
100  (*current_liboctave_error_handler)
101  ("regexp: syntax error in pattern");
102 
103  std::string tmp_name =
104  pattern.substr (new_pos+3, tmp_pos-new_pos-3);
105 
106  bool found = false;
107 
108  for (int i = 0; i < nnames; i++)
109  {
110  if (named_pats(i) == tmp_name)
111  {
112  named_idx.resize (dim_vector (inames+1, 1));
113  named_idx(inames) = i;
114  found = true;
115  break;
116  }
117  }
118 
119  if (! found)
120  {
121  named_idx.resize (dim_vector (inames+1, 1));
122  named_idx(inames) = nnames;
123  named_pats.append (tmp_name);
124  nnames++;
125  }
126 
127  if (new_pos - pos > 0)
128  buf << pattern.substr (pos, new_pos-pos);
129  if (inames < 10)
130  buf << "(?P<n00" << inames++;
131  else if (inames < 100)
132  buf << "(?P<n0" << inames++;
133  else
134  buf << "(?P<n" << inames++;
135 
136  pos = tmp_pos;
137  }
138  else if (pattern.at (new_pos + 2) == '<')
139  {
140  // Find lookbehind operators of arbitrary length (ie like
141  // "(?<=[a-z]*)") and replace with a maximum length operator
142  // as PCRE can not yet handle arbitrary length lookahead
143  // operators. Use the string length as the maximum length to
144  // avoid issues.
145 
146  int brackets = 1;
147  size_t tmp_pos1 = new_pos + 2;
148  size_t tmp_pos2 = tmp_pos1;
149 
150  while (tmp_pos1 < pattern.length () && brackets > 0)
151  {
152  char ch = pattern.at (tmp_pos1);
153 
154  if (ch == '(')
155  brackets++;
156  else if (ch == ')')
157  {
158  if (brackets > 1)
159  tmp_pos2 = tmp_pos1;
160 
161  brackets--;
162  }
163 
164  tmp_pos1++;
165  }
166 
167  if (brackets != 0)
168  {
169  buf << pattern.substr (pos, new_pos - pos) << "(?";
170  pos = new_pos + 2;
171  }
172  else
173  {
174  size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2);
175 
176  if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1)
177  {
178  if (! lookbehind_warned)
179  {
180  lookbehind_warned = true;
181  (*current_liboctave_warning_with_id_handler)
182  ("Octave:regexp-lookbehind-limit",
183  "%s: arbitrary length lookbehind patterns are only supported up to length %d",
184  who.c_str (), MAXLOOKBEHIND);
185  }
186 
187  buf << pattern.substr (pos, new_pos - pos) << '(';
188 
189  size_t i;
190 
191  if (pattern.at (tmp_pos3) == '*')
192  i = 0;
193  else
194  i = 1;
195 
196  for (; i < max_length + 1; i++)
197  {
198  buf << pattern.substr (new_pos, tmp_pos3 - new_pos)
199  << '{' << i << '}';
200  buf << pattern.substr (tmp_pos3 + 1,
201  tmp_pos1 - tmp_pos3 - 1);
202  if (i != max_length)
203  buf << '|';
204  }
205  buf << ')';
206  }
207  else
208  buf << pattern.substr (pos, tmp_pos1 - pos);
209 
210  pos = tmp_pos1;
211  }
212  }
213  else
214  {
215  buf << pattern.substr (pos, new_pos - pos) << "(?";
216  pos = new_pos + 2;
217  }
218 
219  }
220 
221  buf << pattern.substr (pos);
222 
223  // Replace NULLs with escape sequence because conversion function c_str()
224  // will terminate string early at embedded NULLs.
225  std::string buf_str = buf.str ();
226  while ((pos = buf_str.find ('\0')) != std::string::npos)
227  buf_str.replace (pos, 1, "\\000");
228 
229  const char *err;
230  int erroffset;
231 
232  int pcre_options
233  = ( (options.case_insensitive () ? PCRE_CASELESS : 0)
234  | (options.dotexceptnewline () ? 0 : PCRE_DOTALL)
235  | (options.lineanchors () ? PCRE_MULTILINE : 0)
236  | (options.freespacing () ? PCRE_EXTENDED : 0));
237 
238  data = pcre_compile (buf_str.c_str (), pcre_options,
239  &err, &erroffset, nullptr);
240 
241  if (! data)
242  (*current_liboctave_error_handler)
243  ("%s: %s at position %d of expression", who.c_str (), err, erroffset);
244  }
245 
247  regexp::match (const std::string& buffer)
248  {
250 
251  std::list<regexp::match_element> lst;
252 
253  int subpatterns;
254  int namecount;
255  int nameentrysize;
256  char *nametable;
257  size_t idx = 0;
258 
259  pcre *re = static_cast<pcre *> (data);
260 
261  pcre_fullinfo (re, nullptr, PCRE_INFO_CAPTURECOUNT, &subpatterns);
262  pcre_fullinfo (re, nullptr, PCRE_INFO_NAMECOUNT, &namecount);
263  pcre_fullinfo (re, nullptr, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
264  pcre_fullinfo (re, nullptr, PCRE_INFO_NAMETABLE, &nametable);
265 
266  OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3);
267  OCTAVE_LOCAL_BUFFER (int, nidx, namecount);
268 
269  for (int i = 0; i < namecount; i++)
270  {
271  // Index of subpattern in first two bytes of name (MSB first).
272  // Extract index.
273  nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8
274  | static_cast<int> (nametable[i*nameentrysize+1]);
275  }
276 
277  while (true)
278  {
279  octave_quit ();
280 
281  int matches = pcre_exec (re, nullptr, buffer.c_str (),
282  buffer.length (), idx,
283  (idx ? PCRE_NOTBOL : 0),
284  ovector, (subpatterns+1)*3);
285 
286  if (matches == PCRE_ERROR_MATCHLIMIT)
287  {
288  // Try harder; start with default value for MATCH_LIMIT
289  // and increase it.
290  (*current_liboctave_warning_with_id_handler)
291  ("Octave:regexp-match-limit",
292  "your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow");
293 
294  pcre_extra pe;
295 
296  pcre_config (PCRE_CONFIG_MATCH_LIMIT,
297  static_cast<void *> (&pe.match_limit));
298 
299  pe.flags = PCRE_EXTRA_MATCH_LIMIT;
300 
301  int i = 0;
302  while (matches == PCRE_ERROR_MATCHLIMIT
303  && i++ < PCRE_MATCHLIMIT_MAX)
304  {
305  octave_quit ();
306 
307  pe.match_limit *= 10;
308  matches = pcre_exec (re, &pe, buffer.c_str (),
309  buffer.length (), idx,
310  (idx ? PCRE_NOTBOL : 0),
311  ovector, (subpatterns+1)*3);
312  }
313  }
314 
315  if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
316  (*current_liboctave_error_handler)
317  ("%s: internal error calling pcre_exec; "
318  "error code from pcre_exec is %i", who.c_str (), matches);
319 
320  if (matches == PCRE_ERROR_NOMATCH)
321  break;
322  else if (ovector[0] >= ovector[1] && ! options.emptymatch ())
323  {
324  // Zero length match. Skip to next char.
325  idx = ovector[0] + 1;
326  if (idx < buffer.length ())
327  continue;
328  else
329  break;
330  }
331  else
332  {
333  int pos_match = 0;
334  Matrix token_extents (matches-1, 2);
335 
336  for (int i = 1; i < matches; i++)
337  {
338  if (ovector[2*i] >= 0 && ovector[2*i+1] > 0
339  && (i == 1 || ovector[2*i] != ovector[2*i-2]
340  || ovector[2*i-1] != ovector[2*i+1]))
341  {
342  token_extents(pos_match,0) = double (ovector[2*i]+1);
343  token_extents(pos_match++,1) = double (ovector[2*i+1]);
344  }
345  }
346 
347  token_extents.resize (pos_match, 2);
348 
349  double start = double (ovector[0]+1);
350  double end = double (ovector[1]);
351 
352  const char **listptr;
353  int status = pcre_get_substring_list (buffer.c_str (), ovector,
354  matches, &listptr);
355 
356  if (status == PCRE_ERROR_NOMEMORY)
357  (*current_liboctave_error_handler)
358  ("%s: cannot allocate memory in pcre_get_substring_list",
359  who.c_str ());
360 
361  // Must use explicit length constructor as match can contain '\0'.
362  std::string match_string = std::string (*listptr, end - start + 1);
363 
364  string_vector tokens (pos_match);
365  string_vector named_tokens (nnames);
366  int pos_offset = 0;
367  pos_match = 0;
368 
369  for (int i = 1; i < matches; i++)
370  {
371  if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
372  {
373  if (i == 1 || ovector[2*i] != ovector[2*i-2]
374  || ovector[2*i-1] != ovector[2*i+1])
375  {
376  if (namecount > 0)
377  {
378  // FIXME: Should probably do this with a map()
379  // rather than a linear search. However,
380  // the number of captured, named expressions
381  // is usually pretty small (< 4)
382  for (int j = 0; j < namecount; j++)
383  {
384  if (nidx[j] == i)
385  {
386  size_t len = ovector[2*i+1] - ovector[2*i];
387  named_tokens(named_idx(j)) =
388  std::string (*(listptr+i-pos_offset),
389  len);
390  break;
391  }
392  }
393  }
394 
395  size_t len = ovector[2*i+1] - ovector[2*i];
396  tokens(pos_match++) = std::string (*(listptr+i), len);
397  }
398  else
399  pos_offset++;
400  }
401  }
402 
403  pcre_free_substring_list (listptr);
404 
405  regexp::match_element new_elem (named_tokens, tokens, match_string,
406  token_extents, start, end);
407  lst.push_back (new_elem);
408 
409  if (ovector[1] <= ovector[0])
410  {
411  // Zero length match. Skip to next char.
412  idx = ovector[0] + 1;
413  if (idx <= buffer.length ())
414  continue;
415  }
416  else
417  idx = ovector[1];
418 
419  if (options.once () || idx >= buffer.length ())
420  break;
421  }
422  }
423 
425 
426  return retval;
427  }
428 
429  bool
431  {
432  regexp::match_data rx_lst = match (buffer);
433 
434  return rx_lst.size () > 0;
435  }
436 
439  {
440  octave_idx_type len = buffer.numel ();
441 
442  Array<bool> retval (dim_vector (len, 1));
443 
444  for (octave_idx_type i = 0; i < buffer.numel (); i++)
445  retval(i) = is_match (buffer(i));
446 
447  return retval;
448  }
449 
450  // Declare rep_token_t used in processing replacement string
451  typedef struct
452  {
453  size_t pos;
454  int num;
455  } rep_token_t;
456 
458  regexp::replace (const std::string& buffer, const std::string& replacement)
459  {
461 
462  regexp::match_data rx_lst = match (buffer);
463 
464  size_t num_matches = rx_lst.size ();
465 
466  if (num_matches == 0)
467  {
468  retval = buffer;
469  return retval;
470  }
471 
472  // Identify replacement tokens; build a vector of group numbers in
473  // the replacement string so that we can quickly calculate the size
474  // of the replacement.
475 
476  // FIXME: All code assumes that only 10 tokens ($0-$9) exist.
477  // $11 represents $1 followed by the character '1' rather than
478  // the eleventh capture buffer.
479 
480  std::string repstr = replacement;
481  std::vector<rep_token_t> tokens;
482  tokens.reserve (5); // Reserve memory for 5 pattern replacements
483 
484  for (size_t i=0; i < repstr.size (); i++)
485  {
486  if (repstr[i] == '\\')
487  {
488  if (i < repstr.size () - 1 && repstr[i+1] == '$')
489  {
490  repstr.erase (i,1); // erase backslash
491  i++; // skip over '$'
492  continue;
493  }
494  if (i < repstr.size () - 1 && repstr[i+1] == '\\')
495  {
496  repstr.erase (i,1); // erase 1st backslash
497  continue;
498  }
499  }
500  else if (repstr[i] == '$')
501  {
502  if (i < repstr.size () - 1 && isdigit (repstr[i+1]))
503  {
504  rep_token_t tmp_token;
505 
506  tmp_token.pos = i;
507  tmp_token.num = repstr[i+1]-'0';
508  tokens.push_back (tmp_token);
509  }
510  }
511  }
512 
513  std::string rep;
514  int num_tokens = tokens.size ();
515 
516  if (num_tokens > 0)
517  {
518  // Determine replacement length
519  const size_t replen = repstr.size () - 2*num_tokens;
520  int delta = 0;
522  for (size_t i = 0; i < num_matches; i++)
523  {
524  octave_quit ();
525 
526  double start = p->start ();
527  double end = p->end ();
528 
529  const Matrix pairs (p->token_extents ());
530  size_t pairlen = 0;
531  for (int j = 0; j < num_tokens; j++)
532  {
533  if (tokens[j].num == 0)
534  pairlen += static_cast<size_t> (end - start + 1);
535  else if (tokens[j].num <= pairs.rows ())
536  pairlen += static_cast<size_t> (pairs(tokens[j].num-1,1)
537  - pairs(tokens[j].num-1,0)
538  + 1);
539  }
540  delta += (static_cast<int> (replen + pairlen)
541  - static_cast<int> (end - start + 1));
542  p++;
543  }
544 
545  // Build replacement string
546  rep.reserve (buffer.size () + delta);
547  size_t from = 0;
548  p = rx_lst.begin ();
549  for (size_t i = 0; i < num_matches; i++)
550  {
551  octave_quit ();
552 
553  double start = p->start ();
554  double end = p->end ();
555 
556  const Matrix pairs (p->token_extents ());
557  rep.append (&buffer[from], static_cast<size_t> (start - 1 - from));
558  from = static_cast<size_t> (end);
559 
560  size_t cur_pos = 0;
561 
562  for (int j = 0; j < num_tokens; j++)
563  {
564  rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos);
565  cur_pos = tokens[j].pos+2;
566 
567  int k = tokens[j].num;
568  if (k == 0)
569  {
570  // replace with entire match
571  rep.append (&buffer[static_cast<size_t> (end - 1)],
572  static_cast<size_t> (end - start + 1));
573  }
574  else if (k <= pairs.rows ())
575  {
576  // replace with group capture
577  rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
578  static_cast<size_t> (pairs(k-1,1)
579  - pairs(k-1,0) + 1));
580  }
581  else
582  {
583  // replace with nothing
584  }
585  }
586  if (cur_pos < repstr.size ())
587  rep.append (&repstr[cur_pos], repstr.size () - cur_pos);
588 
589  p++;
590  }
591  rep.append (&buffer[from], buffer.size () - from);
592  }
593  else
594  {
595  // Determine repstr length
596  const size_t replen = repstr.size ();
597  int delta = 0;
599  for (size_t i = 0; i < num_matches; i++)
600  {
601  octave_quit ();
602 
603  delta += static_cast<int> (replen)
604  - static_cast<int> (p->end () - p->start () + 1);
605  p++;
606  }
607 
608  // Build replacement string
609  rep.reserve (buffer.size () + delta);
610  size_t from = 0;
611  p = rx_lst.begin ();
612  for (size_t i = 0; i < num_matches; i++)
613  {
614  octave_quit ();
615 
616  rep.append (&buffer[from],
617  static_cast<size_t> (p->start () - 1 - from));
618  from = static_cast<size_t> (p->end ());
619  rep.append (repstr);
620  p++;
621  }
622  rep.append (&buffer[from], buffer.size () - from);
623  }
624 
625  retval = rep;
626  return retval;
627  }
628 }
std::string who
Definition: lo-regexp.h:282
is already an absolute the name is checked against the file system instead of Octave s loadpath In this if otherwise an empty string is returned If the first argument is a cell array of search each directory of the loadpath for element of the cell array and return the first that matches If the second optional argument return a cell array containing the list of all files that have the same name in the path If no files are found
Definition: utils.cc:305
void resize(octave_idx_type nr, octave_idx_type nc, double rfv=0)
Definition: dMatrix.h:148
for large enough k
Definition: lu.cc:617
size_t size(void) const
Definition: base-list.h:49
void dotexceptnewline(bool val)
Definition: lo-regexp.h:173
string_vector & append(const std::string &s)
Definition: str-vec.cc:107
void resize(const dim_vector &dv, const T &rfv)
Resizing (with fill).
Definition: Array.cc:1010
Matrix append(const Matrix &a) const
Definition: dMatrix.cc:264
octave_value retval
Definition: data.cc:6246
std::string replace(const std::string &buffer, const std::string &replacement)
Definition: lo-regexp.cc:458
Definition: dMatrix.h:36
bool is_match(const std::string &buffer)
Definition: lo-regexp.cc:430
Array< int > named_idx
Definition: lo-regexp.h:281
#define MAXLOOKBEHIND
Definition: lo-regexp.cc:53
match_data match(const std::string &buffer)
Definition: lo-regexp.cc:247
std::list< match_element >::const_iterator const_iterator
Definition: base-list.h:41
octave::sys::time start
Definition: graphics.cc:12337
void emptymatch(bool val)
Definition: lo-regexp.h:174
p
Definition: lu.cc:138
void compile_internal(void)
Definition: lo-regexp.cc:70
string_vector named_pats
Definition: lo-regexp.h:279
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:41
void free(void)
Definition: lo-regexp.cc:63
for i
Definition: data.cc:5264
#define PCRE_MATCHLIMIT_MAX
Definition: lo-regexp.cc:50
OCTAVE_EXPORT octave_value_list error nd deftypefn *const octave_scalar_map err
Definition: error.cc:1049
void case_insensitive(bool val)
Definition: lo-regexp.h:172
octave_idx_type numel(void) const
Number of elements in the array.
Definition: Array.h:366
void lineanchors(bool val)
Definition: lo-regexp.h:176
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:87
std::string pattern
Definition: lo-regexp.h:271
If this string is the system will ring the terminal sometimes it is useful to be able to print the original representation of the string
Definition: utils.cc:888
static bool lookbehind_warned
Definition: lo-regexp.cc:55
iterator begin(void)
Definition: base-list.h:83
void once(bool val)
Definition: lo-regexp.h:177
void freespacing(bool val)
Definition: lo-regexp.h:175