GNU Octave  3.8.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
strfind.cc
Go to the documentation of this file.
1 /*
2 
3 Copyright (C) 2009-2013 Jaroslav Hajek
4 Copyright (C) 2009-2010 VZLU Prague
5 
6 This file is part of Octave.
7 
8 Octave is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 
13 Octave is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with Octave; see the file COPYING. If not, see
20 <http://www.gnu.org/licenses/>.
21 
22 */
23 
24 #ifdef HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 
28 #include <algorithm>
29 #include <deque>
30 #include <limits>
31 #include <string>
32 
33 #include "Cell.h"
34 #include "ov.h"
35 #include "defun.h"
36 #include "unwind-prot.h"
37 #include "gripes.h"
38 #include "utils.h"
39 
40 // This allows safe indexing with char.
41 // In C++, char may be (and often is) signed!
42 #define ORD(ch) static_cast<unsigned char>(ch)
43 #define TABSIZE (std::numeric_limits<unsigned char>::max () + 1)
44 
45 // This is the quick search algorithm, as described at
46 // http://www-igm.univ-mlv.fr/~lecroq/string/node19.html
47 static void
48 qs_preprocess (const Array<char>& needle,
49  octave_idx_type *table)
50 {
51  const char *x = needle.data ();
52  octave_idx_type m = needle.numel ();
53 
54  for (octave_idx_type i = 0; i < TABSIZE; i++)
55  table[i] = m + 1;
56  for (octave_idx_type i = 0; i < m; i++)
57  table[ORD(x[i])] = m - i;
58 }
59 
60 
62 qs_search (const Array<char>& needle,
63  const Array<char>& haystack,
64  const octave_idx_type *table,
65  bool overlaps = true)
66 {
67  const char *x = needle.data ();
68  octave_idx_type m = needle.numel ();
69  const char *y = haystack.data ();
70  octave_idx_type n = haystack.numel ();
71 
72  // We'll use deque because it typically has the most favorable properties for
73  // the operation we need.
74  std::deque<octave_idx_type> accum;
75  if (m == 1)
76  {
77  // Looking for a single character.
78  for (octave_idx_type i = 0; i < n; i++)
79  {
80  if (y[i] == x[0])
81  accum.push_back (i);
82  }
83  }
84  else if (m == 2)
85  {
86  // Two characters.
87  if (overlaps)
88  {
89  for (octave_idx_type i = 0; i < n-1; i++)
90  {
91  if (y[i] == x[0] && y[i+1] == x[1])
92  accum.push_back (i);
93  }
94  }
95  else
96  {
97  for (octave_idx_type i = 0; i < n-1; i++)
98  {
99  if (y[i] == x[0] && y[i+1] == x[1])
100  accum.push_back (i++);
101  }
102  }
103  }
104  else if (n >= m)
105  {
106  // General case.
107  octave_idx_type j = 0;
108 
109  if (overlaps)
110  {
111  while (j < n - m)
112  {
113  if (std::equal (x, x + m, y + j))
114  accum.push_back (j);
115  j += table[ORD(y[j + m])];
116  }
117  }
118  else
119  {
120  while (j < n - m)
121  {
122  if (std::equal (x, x + m, y + j))
123  {
124  accum.push_back (j);
125  j += m;
126  }
127  else
128  j += table[ORD(y[j + m])];
129  }
130  }
131 
132  if (j == n - m && std::equal (x, x + m, y + j))
133  accum.push_back (j);
134  }
135 
136  octave_idx_type nmatch = accum.size ();
137  octave_idx_type one = 1;
138  Array<octave_idx_type> result (dim_vector (std::min (one, nmatch), nmatch));
139  octave_idx_type k = 0;
140  for (std::deque<octave_idx_type>::const_iterator iter = accum.begin ();
141  iter != accum.end (); iter++)
142  {
143  result.xelem (k++) = *iter;
144  }
145 
146  return result;
147 }
148 
149 DEFUN (strfind, args, ,
150  "-*- texinfo -*-\n\
151 @deftypefn {Built-in Function} {@var{idx} =} strfind (@var{str}, @var{pattern})\n\
152 @deftypefnx {Built-in Function} {@var{idx} =} strfind (@var{cellstr}, @var{pattern})\n\
153 @deftypefnx {Built-in Function} {@var{idx} =} strfind (@dots{}, \"overlaps\", @var{val})\n\
154 Search for @var{pattern} in the string @var{str} and return the\n\
155 starting index of every such occurrence in the vector @var{idx}.\n\
156 \n\
157 If there is no such occurrence, or if @var{pattern} is longer\n\
158 than @var{str}, then @var{idx} is the empty array @code{[]}.\n\
159 The optional argument @qcode{\"overlaps\"} determines whether the pattern\n\
160 can match at every position in @var{str} (true), or only for unique\n\
161 occurrences of the complete pattern (false). The default is true.\n\
162 \n\
163 If a cell array of strings @var{cellstr} is specified\n\
164 then @var{idx} is a cell array of vectors, as specified above.\n\
165 \n\
166 Examples:\n\
167 \n\
168 @example\n\
169 @group\n\
170 strfind (\"abababa\", \"aba\")\n\
171  @result{} [1, 3, 5]\n\
172 \n\
173 strfind (\"abababa\", \"aba\", \"overlaps\", false)\n\
174  @result{} [1, 5]\n\
175 \n\
176 strfind (@{\"abababa\", \"bebebe\", \"ab\"@}, \"aba\")\n\
177  @result{}\n\
178  @{\n\
179  [1,1] =\n\
180 \n\
181  1 3 5\n\
182 \n\
183  [1,2] = [](1x0)\n\
184  [1,3] = [](1x0)\n\
185  @}\n\
186 @end group\n\
187 @end example\n\
188 @seealso{findstr, strmatch, regexp, regexpi, find}\n\
189 @end deftypefn")
190 {
191  octave_value retval;
192  int nargin = args.length ();
193  bool overlaps = true;
194 
195  if (nargin == 4 && args(2).is_string () && args(3).is_scalar_type ())
196  {
197  std::string opt = args(2).string_value ();
198  if (opt == "overlaps")
199  {
200  overlaps = args(3).bool_value ();
201  nargin = 2;
202  }
203  else
204  {
205  error ("strfind: unknown option: %s", opt.c_str ());
206  return retval;
207  }
208  }
209 
210  if (nargin == 2)
211  {
212  octave_value argstr = args(0), argpat = args(1);
213  if (argpat.is_string ())
214  {
215  Array<char> needle = argpat.char_array_value ();
217  qs_preprocess (needle, table);
218 
219  if (argstr.is_string ())
220  retval = octave_value (qs_search (needle,
221  argstr.char_array_value (),
222  table, overlaps),
223  true, true);
224  else if (argstr.is_cell ())
225  {
226  const Cell argsc = argstr.cell_value ();
227  Cell retc (argsc.dims ());
228  octave_idx_type ns = argsc.numel ();
229 
230  for (octave_idx_type i = 0; i < ns; i++)
231  {
232  octave_value argse = argsc(i);
233  if (argse.is_string ())
234  retc(i)
235  = octave_value (qs_search (needle,
236  argse.char_array_value (),
237  table, overlaps),
238  true, true);
239  else
240  {
241  error ("strfind: each element of CELLSTR must be a string");
242  break;
243  }
244  }
245 
246  retval = retc;
247  }
248  else
249  error ("strfind: first argument must be a string or cell array of strings");
250  }
251  else if (argpat.is_cell ())
252  retval = do_simple_cellfun (Fstrfind, "strfind", args);
253  else
254  error ("strfind: PATTERN must be a string or cell array of strings");
255  }
256  else
257  print_usage ();
258 
259  return retval;
260 }
261 
262 /*
263 %!assert (strfind ("abababa", "aba"), [1, 3, 5])
264 %!assert (strfind ("abababa", "aba", "overlaps", false), [1, 5])
265 %!assert (strfind ({"abababa", "bla", "bla"}, "a"), {[1, 3, 5, 7], 3, 3})
266 %!assert (strfind ("Linux _is_ user-friendly. It just isn't ignorant-friendly or idiot-friendly.", "friendly"), [17, 50, 68])
267 
268 %!error strfind ()
269 %!error strfind ("foo", "bar", 1)
270 %!error <PATTERN must be a string> strfind ("foo", 100)
271 %!error <first argument must be a string> strfind (100, "foo")
272 */
273 
274 static Array<char>
275 qs_replace (const Array<char>& str, const Array<char>& pat,
276  const Array<char>& rep,
277  const octave_idx_type *table,
278  bool overlaps = true)
279 {
280  Array<char> ret = str;
281 
282  octave_idx_type siz = str.numel (), psiz = pat.numel (), rsiz = rep.numel ();
283 
284  if (psiz != 0)
285  {
286  // Look up matches, without overlaps.
287  const Array<octave_idx_type> idx = qs_search (pat, str, table, overlaps);
288  octave_idx_type nidx = idx.numel ();
289 
290  if (nidx)
291  {
292  // Compute result size.
293  octave_idx_type retsiz;
294  if (overlaps)
295  {
296  retsiz = 0;
297  // OMG. Is this the "right answer" MW always looks for, or
298  // someone was just lazy?
299  octave_idx_type k = 0;
300  for (octave_idx_type i = 0; i < nidx; i++)
301  {
302  octave_idx_type j = idx(i);
303  if (j >= k)
304  retsiz += j - k;
305  retsiz += rsiz;
306  k = j + psiz;
307  }
308 
309  retsiz += siz - k;
310  }
311  else
312  retsiz = siz + nidx * (rsiz - psiz);
313 
314  ret.clear (dim_vector (1, retsiz));
315  const char *src = str.data (), *reps = rep.data ();
316  char *dest = ret.fortran_vec ();
317 
318  octave_idx_type k = 0;
319  for (octave_idx_type i = 0; i < nidx; i++)
320  {
321  octave_idx_type j = idx(i);
322  if (j >= k)
323  dest = std::copy (src + k, src + j, dest);
324  dest = std::copy (reps, reps + rsiz, dest);
325  k = j + psiz;
326  }
327 
328  std::copy (src + k, src + siz, dest);
329  }
330  }
331 
332  return ret;
333 }
334 
335 DEFUN (strrep, args, ,
336  "-*- texinfo -*-\n\
337 @deftypefn {Built-in Function} {@var{newstr} =} strrep (@var{str}, @var{ptn}, @var{rep})\n\
338 @deftypefnx {Built-in Function} {@var{newstr} =} strrep (@var{cellstr}, @var{ptn}, @var{rep})\n\
339 @deftypefnx {Built-in Function} {@var{newstr} =} strrep (@dots{}, \"overlaps\", @var{val})\n\
340 Replace all occurrences of the pattern @var{ptn} in the string @var{str}\n\
341 with the string @var{rep} and return the result.\n\
342 \n\
343 The optional argument @qcode{\"overlaps\"} determines whether the pattern\n\
344 can match at every position in @var{str} (true), or only for unique\n\
345 occurrences of the complete pattern (false). The default is true.\n\
346 \n\
347 @var{s} may also be a cell array of strings, in which case the replacement is\n\
348 done for each element and a cell array is returned.\n\
349 \n\
350 Example:\n\
351 \n\
352 @example\n\
353 @group\n\
354 strrep (\"This is a test string\", \"is\", \"&%$\")\n\
355  @result{} \"Th&%$ &%$ a test string\"\n\
356 @end group\n\
357 @end example\n\
358 \n\
359 @seealso{regexprep, strfind, findstr}\n\
360 @end deftypefn")
361 {
362  octave_value retval;
363  int nargin = args.length ();
364  bool overlaps = true;
365 
366  if (nargin == 5 && args(3).is_string () && args(4).is_scalar_type ())
367  {
368  std::string opt = args(3).string_value ();
369  if (opt == "overlaps")
370  {
371  overlaps = args(4).bool_value ();
372  nargin = 3;
373  }
374  else
375  {
376  error ("strrep: unknown option: %s", opt.c_str ());
377  return retval;
378  }
379  }
380 
381  if (nargin == 3)
382  {
383  octave_value argstr = args(0), argpat = args(1), argrep = args(2);
384  if (argpat.is_string () && argrep.is_string ())
385  {
386  const Array<char> pat = argpat.char_array_value ();
387  const Array<char> rep = argrep.char_array_value ();
388 
390  qs_preprocess (pat, table);
391 
392  if (argstr.is_string ())
393  retval = qs_replace (argstr.char_array_value (), pat, rep,
394  table, overlaps);
395  else if (argstr.is_cell ())
396  {
397  const Cell argsc = argstr.cell_value ();
398  Cell retc (argsc.dims ());
399  octave_idx_type ns = argsc.numel ();
400 
401  for (octave_idx_type i = 0; i < ns; i++)
402  {
403  octave_value argse = argsc(i);
404  if (argse.is_string ())
405  retc(i) = qs_replace (argse.char_array_value (), pat, rep,
406  table, overlaps);
407  else
408  {
409  error ("strrep: each element of S must be a string");
410  break;
411  }
412  }
413 
414  retval = retc;
415  }
416  else
417  error ("strrep: S must be a string or cell array of strings");
418  }
419  else if (argpat.is_cell () || argrep.is_cell ())
420  retval = do_simple_cellfun (Fstrrep, "strrep", args);
421  else
422  error ("strrep: PTN and REP arguments must be strings or cell arrays of strings");
423  }
424  else
425  print_usage ();
426 
427  return retval;
428 }
429 
430 /*
431 %!assert (strrep ("This is a test string", "is", "&%$"),
432 %! "Th&%$ &%$ a test string")
433 %!assert (strrep ("abababc", "abab", "xyz"), "xyzxyzc")
434 %!assert (strrep ("abababc", "abab", "xyz", "overlaps", false), "xyzabc")
435 
436 %!error strrep ()
437 %!error strrep ("foo", "bar", 3, 4)
438 */