regexp.cc

Go to the documentation of this file.
00001 /*
00002 
00003 Copyright (C) 2012 John W. Eaton
00004 Copyright (C) 2005-2012 David Bateman
00005 Copyright (C) 2002-2005 Paul Kienzle
00006 
00007 This file is part of Octave.
00008 
00009 Octave is free software; you can redistribute it and/or modify it
00010 under the terms of the GNU General Public License as published by the
00011 Free Software Foundation; either version 3 of the License, or (at your
00012 option) any later version.
00013 
00014 Octave is distributed in the hope that it will be useful, but WITHOUT
00015 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
00016 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
00017 for more details.
00018 
00019 You should have received a copy of the GNU General Public License
00020 along with Octave; see the file COPYING.  If not, see
00021 <http://www.gnu.org/licenses/>.
00022 
00023 */
00024 
00025 #ifdef HAVE_CONFIG_H
00026 #include <config.h>
00027 #endif
00028 
00029 #include <list>
00030 #include <sstream>
00031 #include <string>
00032 #include <vector>
00033 
00034 #if defined (HAVE_PCRE_H)
00035 #include <pcre.h>
00036 #elif defined (HAVE_PCRE_PCRE_H)
00037 #include <pcre/pcre.h>
00038 #endif
00039 
00040 #include "Matrix.h"
00041 #include "base-list.h"
00042 #include "lo-error.h"
00043 #include "oct-locbuf.h"
00044 #include "quit.h"
00045 #include "regexp.h"
00046 #include "str-vec.h"
00047 
00048 // Define the maximum number of retries for a pattern that possibly
00049 // results in an infinite recursion.
00050 #define PCRE_MATCHLIMIT_MAX 10
00051 
00052 // FIXME -- should this be configurable?
00053 #define MAXLOOKBEHIND 10
00054 
00055 static bool lookbehind_warned = false;
00056 
00057 // FIXME -- don't bother collecting and composing return values the user
00058 // doesn't want.
00059 
00060 void
00061 regexp::free (void)
00062 {
00063   if (data)
00064     pcre_free (static_cast<pcre *> (data));
00065 }
00066 
00067 void
00068 regexp::compile_internal (void)
00069 {
00070   // If we had a previously compiled pattern, release it.
00071   free ();
00072 
00073   size_t max_length = MAXLOOKBEHIND;
00074 
00075   size_t pos = 0;
00076   size_t new_pos;
00077   int inames = 0;
00078   std::ostringstream buf;
00079 
00080   while ((new_pos = pattern.find ("(?", pos)) != std::string::npos)
00081     {
00082       if (pattern.at (new_pos + 2) == '<'
00083           && !(pattern.at (new_pos + 3) == '='
00084                || pattern.at (new_pos + 3) == '!'))
00085         {
00086           // The syntax of named tokens in pcre is "(?P<name>...)" while
00087           // we need a syntax "(?<name>...)", so fix that here. Also an
00088           // expression like
00089           // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)"
00090           // should be perfectly legal, while pcre does not allow the same
00091           // named token name on both sides of the alternative. Also fix
00092           // that here by replacing name tokens by dummy names, and dealing
00093           // with the dummy names later.
00094 
00095           size_t tmp_pos = pattern.find_first_of ('>', new_pos);
00096 
00097           if (tmp_pos == std::string::npos)
00098             {
00099               (*current_liboctave_error_handler)
00100                 ("regexp: syntax error in pattern");
00101               return;
00102             }
00103 
00104           std::string tmp_name =
00105             pattern.substr (new_pos+3, tmp_pos-new_pos-3);
00106 
00107           bool found = false;
00108 
00109           for (int i = 0; i < nnames; i++)
00110             {
00111               if (named_pats(i) == tmp_name)
00112                 {
00113                   named_idx.resize (dim_vector (inames+1, 1));
00114                   named_idx(inames) = i;
00115                   found = true;
00116                   break;
00117                 }
00118             }
00119 
00120           if (! found)
00121             {
00122               named_idx.resize (dim_vector (inames+1, 1));
00123               named_idx(inames) = nnames;
00124               named_pats.append (tmp_name);
00125               nnames++;
00126             }
00127 
00128           if (new_pos - pos > 0)
00129             buf << pattern.substr (pos, new_pos-pos);
00130           if (inames < 10)
00131             buf << "(?P<n00" << inames++;
00132           else if (inames < 100)
00133             buf << "(?P<n0" << inames++;
00134           else
00135             buf << "(?P<n" << inames++;
00136 
00137           pos = tmp_pos;
00138         }
00139       else if (pattern.at (new_pos + 2) == '<')
00140         {
00141           // Find lookbehind operators of arbitrary length (ie like
00142           // "(?<=[a-z]*)") and replace with a maximum length operator
00143           // as PCRE can not yet handle arbitrary length lookahead
00144           // operators. Use the string length as the maximum length to
00145           // avoid issues.
00146 
00147           int brackets = 1;
00148           size_t tmp_pos1 = new_pos + 2;
00149           size_t tmp_pos2 = tmp_pos1;
00150 
00151           while (tmp_pos1 < pattern.length () && brackets > 0)
00152             {
00153               char ch = pattern.at (tmp_pos1);
00154 
00155               if (ch == '(')
00156                 brackets++;
00157               else if (ch == ')')
00158                 {
00159                   if (brackets > 1)
00160                     tmp_pos2 = tmp_pos1;
00161 
00162                   brackets--;
00163                 }
00164 
00165               tmp_pos1++;
00166             }
00167 
00168           if (brackets != 0)
00169             {
00170               buf << pattern.substr (pos, new_pos - pos) << "(?";
00171               pos = new_pos + 2;
00172             }
00173           else
00174             {
00175               size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2);
00176 
00177               if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1)
00178                 {
00179                   if (!lookbehind_warned)
00180                     {
00181                       lookbehind_warned = true;
00182                       (*current_liboctave_warning_handler)
00183                         ("%s: arbitrary length lookbehind patterns are only supported up to length %d",
00184                                who.c_str (), MAXLOOKBEHIND);
00185                     }
00186 
00187                   buf << pattern.substr (pos, new_pos - pos) << "(";
00188 
00189                   size_t i;
00190 
00191                   if (pattern.at (tmp_pos3) == '*')
00192                     i = 0;
00193                   else
00194                     i = 1;
00195 
00196                   for (; i < max_length + 1; i++)
00197                     {
00198                       buf << pattern.substr (new_pos, tmp_pos3 - new_pos)
00199                           << "{" << i << "}";
00200                       buf << pattern.substr (tmp_pos3 + 1,
00201                                              tmp_pos1 - tmp_pos3 - 1);
00202                       if (i != max_length)
00203                         buf << "|";
00204                     }
00205                   buf << ")";
00206                 }
00207               else
00208                 buf << pattern.substr (pos, tmp_pos1 - pos);
00209 
00210               pos = tmp_pos1;
00211             }
00212         }
00213       else
00214         {
00215           buf << pattern.substr (pos, new_pos - pos) << "(?";
00216           pos = new_pos + 2;
00217         }
00218 
00219     }
00220 
00221   buf << pattern.substr (pos);
00222 
00223   const char *err;
00224   int erroffset;
00225   std::string buf_str = buf.str ();
00226 
00227   int pcre_options
00228     = ((options.case_insensitive () ? PCRE_CASELESS : 0)
00229        | (options.dotexceptnewline () ? 0 : PCRE_DOTALL)
00230        | (options.lineanchors () ? PCRE_MULTILINE : 0)
00231        | (options.freespacing () ? PCRE_EXTENDED : 0));
00232 
00233   data = pcre_compile (buf_str.c_str (), pcre_options, &err, &erroffset, 0);
00234 
00235   if (! data)
00236     (*current_liboctave_error_handler)
00237       ("%s: %s at position %d of expression", who.c_str (),
00238        err, erroffset);
00239 }
00240 
00241 regexp::match_data
00242 regexp::match (const std::string& buffer)
00243 {
00244   regexp::match_data retval;
00245 
00246   std::list<regexp::match_element> lst;
00247 
00248   int subpatterns;
00249   int namecount;
00250   int nameentrysize;
00251   char *nametable;
00252   size_t idx = 0;
00253 
00254   pcre *re = static_cast <pcre *> (data);
00255 
00256   pcre_fullinfo (re, 0, PCRE_INFO_CAPTURECOUNT,  &subpatterns);
00257   pcre_fullinfo (re, 0, PCRE_INFO_NAMECOUNT, &namecount);
00258   pcre_fullinfo (re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
00259   pcre_fullinfo (re, 0, PCRE_INFO_NAMETABLE, &nametable);
00260 
00261   OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3);
00262   OCTAVE_LOCAL_BUFFER (int, nidx, namecount);
00263 
00264   for (int i = 0; i < namecount; i++)
00265     {
00266       // Index of subpattern in first two bytes MSB first of name.
00267       // Extract index.
00268       nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8
00269         | static_cast<int> (nametable[i*nameentrysize+1]);
00270     }
00271 
00272   while (true)
00273     {
00274       OCTAVE_QUIT;
00275 
00276       int matches = pcre_exec (re, 0, buffer.c_str (),
00277                                buffer.length (), idx,
00278                                (idx ? PCRE_NOTBOL : 0),
00279                                ovector, (subpatterns+1)*3);
00280 
00281       if (matches == PCRE_ERROR_MATCHLIMIT)
00282         {
00283           // Try harder; start with default value for MATCH_LIMIT
00284           // and increase it.
00285           (*current_liboctave_warning_handler)
00286             ("your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow");
00287 
00288           pcre_extra pe;
00289 
00290           pcre_config (PCRE_CONFIG_MATCH_LIMIT,
00291                        static_cast <void *> (&pe.match_limit));
00292 
00293           pe.flags = PCRE_EXTRA_MATCH_LIMIT;
00294 
00295           int i = 0;
00296           while (matches == PCRE_ERROR_MATCHLIMIT
00297                  && i++ < PCRE_MATCHLIMIT_MAX)
00298             {
00299               OCTAVE_QUIT;
00300 
00301               pe.match_limit *= 10;
00302               matches = pcre_exec (re, &pe, buffer.c_str (),
00303                                    buffer.length (), idx,
00304                                    (idx ? PCRE_NOTBOL : 0),
00305                                    ovector, (subpatterns+1)*3);
00306             }
00307         }
00308 
00309       if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
00310         {
00311           (*current_liboctave_error_handler)
00312             ("%s: internal error calling pcre_exec; error code from pcre_exec is %i",
00313              who.c_str (), matches);
00314           return retval;
00315         }
00316       else if (matches == PCRE_ERROR_NOMATCH)
00317         break;
00318       else if (ovector[1] <= ovector[0])
00319         {
00320           // Zero sized match.  Skip to next char.
00321           idx = ovector[0] + 1;
00322           if (idx < buffer.length ())
00323             continue;
00324           else
00325             break;
00326         }
00327       else
00328         {
00329           int pos_match = 0;
00330           Matrix token_extents (matches-1, 2);
00331 
00332           for (int i = 1; i < matches; i++)
00333             {
00334               if (ovector[2*i] >= 0 && ovector[2*i+1] > 0
00335                   && (i == 1 || ovector[2*i] != ovector[2*i-2]
00336                       || ovector[2*i-1] != ovector[2*i+1])
00337                   && ovector[2*i] >= 0 && ovector[2*i+1] > 0)
00338                 {
00339                   token_extents(pos_match,0) = double (ovector[2*i]+1);
00340                   token_extents(pos_match++,1) = double (ovector[2*i+1]);
00341                 }
00342             }
00343 
00344           token_extents.resize (pos_match, 2);
00345 
00346           double start = double (ovector[0]+1);
00347           double end = double (ovector[1]);
00348 
00349           const char **listptr;
00350           int status = pcre_get_substring_list (buffer.c_str (), ovector,
00351                                                 matches, &listptr);
00352 
00353           if (status == PCRE_ERROR_NOMEMORY)
00354             {
00355               (*current_liboctave_error_handler)
00356                 ("%s: cannot allocate memory in pcre_get_substring_list",
00357                  who.c_str ());
00358               return retval;
00359             }
00360 
00361           string_vector tokens (pos_match);
00362           string_vector named_tokens (nnames);
00363           int pos_offset = 0;
00364           pos_match = 0;
00365 
00366           for (int i = 1; i < matches; i++)
00367             {
00368               if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
00369                 {
00370                   if (i == 1 || ovector[2*i] != ovector[2*i-2]
00371                       || ovector[2*i-1] != ovector[2*i+1])
00372                     {
00373                       if (namecount > 0)
00374                         {
00375                           // FIXME: Should probably do this with a map()
00376                           // rather than a linear search.  However,
00377                           // the number of captured, named expressions
00378                           // is usually pretty small (< 4)
00379                           for (int j = 0; j < namecount; j++)
00380                             {
00381                               if (nidx[j] == i)
00382                                 { 
00383                                   named_tokens(named_idx(j)) =
00384                                     std::string (*(listptr+i-pos_offset));
00385                                   break;
00386                                 }
00387                             }
00388                         }
00389 
00390                       tokens(pos_match++) = std::string (*(listptr+i));
00391                     }
00392                   else
00393                     pos_offset++;
00394                 }
00395             }
00396 
00397           std::string match_string = std::string (*listptr);
00398 
00399           pcre_free_substring_list (listptr);
00400 
00401           regexp::match_element new_elem (named_tokens, tokens, match_string,
00402                                           token_extents, start, end);
00403           lst.push_back (new_elem);
00404           idx = ovector[1];
00405 
00406           if (options.once () || idx >= buffer.length ())
00407             break;
00408         }
00409     }
00410 
00411   retval = regexp::match_data (lst, named_pats);
00412 
00413   return retval;
00414 }
00415 
00416 bool
00417 regexp::is_match (const std::string& buffer)
00418 {
00419   regexp::match_data rx_lst = match (buffer);
00420 
00421   regexp::match_data::const_iterator p = rx_lst.begin ();
00422 
00423   std::string match_string = p->match_string ();
00424 
00425   return ! match_string.empty ();
00426 }
00427 
00428 Array<bool>
00429 regexp::is_match (const string_vector& buffer)
00430 {
00431   octave_idx_type len = buffer.length ();
00432 
00433   Array<bool> retval (dim_vector (len, 1));
00434 
00435   for (octave_idx_type i = 0; i < buffer.length (); i++)
00436     retval(i) = is_match (buffer(i));
00437 
00438   return retval;
00439 }
00440 
00441 std::string
00442 regexp::replace (const std::string& buffer, const std::string& replacement)
00443 {
00444   std::string retval;
00445 
00446   // Identify replacement tokens; build a vector of group numbers in
00447   // the replacement string so that we can quickly calculate the size
00448   // of the replacement.
00449 
00450   int tokens = 0;
00451   for (size_t i=1; i < replacement.size (); i++)
00452     {
00453       if (replacement[i-1]=='$' && isdigit (replacement[i]))
00454         {
00455           tokens++;
00456           i++;
00457         }
00458     }
00459   std::vector<int> token (tokens);
00460 
00461   int kk = 0;
00462   for (size_t i = 1; i < replacement.size (); i++)
00463     {
00464       if (replacement[i-1]=='$' && isdigit (replacement[i]))
00465         {
00466           token[kk++] = replacement[i]-'0';
00467           i++;
00468         }
00469     }
00470 
00471   regexp::match_data rx_lst = match (buffer);
00472 
00473   size_t sz = rx_lst.size ();
00474 
00475   if (sz == 0)
00476     {
00477       retval = buffer;
00478       return retval;
00479     }
00480 
00481   std::string rep;
00482 
00483   if (tokens > 0)
00484     {
00485       // Determine replacement length
00486       const size_t replen = replacement.size () - 2*tokens;
00487       int delta = 0;
00488       regexp::match_data::const_iterator p = rx_lst.begin ();
00489       for (size_t i = 0; i < sz; i++)
00490         {
00491           OCTAVE_QUIT;
00492 
00493           double start = p->start ();
00494           double end = p->end ();
00495 
00496           const Matrix pairs (p->token_extents ());
00497           size_t pairlen = 0;
00498           for (int j = 0; j < tokens; j++)
00499             {
00500               if (token[j] == 0)
00501                 pairlen += static_cast<size_t> (end - start) + 1;
00502               else if (token[j] <= pairs.rows ())
00503                 pairlen += static_cast<size_t> (pairs(token[j]-1,1)
00504                                                 - pairs(token[j]-1,0)) + 1;
00505             }
00506           delta += (static_cast<int> (replen + pairlen)
00507                     - static_cast<int> (end - start + 1));
00508           p++;
00509         }
00510 
00511       // Build replacement string
00512       rep.reserve (buffer.size () + delta);
00513       size_t from = 0;
00514       p = rx_lst.begin ();
00515       for (size_t i = 0; i < sz; i++)
00516         {
00517           OCTAVE_QUIT;
00518 
00519           double start = p->start ();
00520           double end = p->end ();
00521 
00522           const Matrix pairs (p->token_extents ());
00523           rep.append (&buffer[from], static_cast<size_t> (start - 1) - from);
00524           from = static_cast<size_t> (end - 1) + 1;
00525 
00526           for (size_t j = 1; j < replacement.size (); j++)
00527             {
00528               if (replacement[j-1]=='$' && isdigit (replacement[j]))
00529                 {
00530                   int k = replacement[j]-'0';
00531                   if (k == 0)
00532                     {
00533                       // replace with entire match
00534                       rep.append (&buffer[static_cast<size_t> (end - 1)],
00535                                   static_cast<size_t> (end - start) + 1);
00536                     }
00537                   else if (k <= pairs.rows ())
00538                     {
00539                       // replace with group capture
00540                       rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
00541                                   static_cast<size_t> (pairs(k-1,1)
00542                                                        - pairs(k-1,0)) + 1);
00543                     }
00544                   else
00545                     {
00546                       // replace with nothing
00547                     }
00548                   j++;
00549                 }
00550               else
00551                 rep.append (1, replacement[j-1]);
00552 
00553               if (j+1 == replacement.size ())
00554                 rep.append (1, replacement[j]);
00555             }
00556           p++;
00557         }
00558       rep.append (&buffer[from], buffer.size () - from);
00559     }
00560   else
00561     {
00562       // Determine replacement length
00563       const size_t replen = replacement.size ();
00564       int delta = 0;
00565       regexp::match_data::const_iterator p = rx_lst.begin ();
00566       for (size_t i = 0; i < sz; i++)
00567         {
00568           OCTAVE_QUIT;
00569           delta += static_cast<int> (replen)
00570             - static_cast<int> (p->end () - p->start () + 1);
00571           p++;
00572         }
00573 
00574       // Build replacement string
00575       rep.reserve (buffer.size () + delta);
00576       size_t from = 0;
00577       p = rx_lst.begin ();
00578       for (size_t i = 0; i < sz; i++)
00579         {
00580           OCTAVE_QUIT;
00581           rep.append (&buffer[from],
00582                       static_cast<size_t> (p->start () - 1) - from);
00583           from = static_cast<size_t> (p->end () - 1) + 1;
00584           rep.append (replacement);
00585           p++;
00586         }
00587       rep.append (&buffer[from], buffer.size () - from);
00588     }
00589 
00590   retval = rep;
00591   return retval;
00592 }
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Defines