GNU Octave  4.0.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
strfind.cc
Go to the documentation of this file.
1 /*
2 
3 Copyright (C) 2009-2015 Jaroslav Hajek
4 Copyright (C) 2009-2010 VZLU Prague
5 
6 This file is part of Octave.
7 
8 Octave is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 
13 Octave is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with Octave; see the file COPYING. If not, see
20 <http://www.gnu.org/licenses/>.
21 
22 */
23 
24 #ifdef HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 
28 #include <algorithm>
29 #include <deque>
30 #include <limits>
31 #include <string>
32 
33 #include "oct-locbuf.h"
34 
35 #include "Cell.h"
36 #include "ov.h"
37 #include "defun.h"
38 #include "unwind-prot.h"
39 #include "gripes.h"
40 #include "utils.h"
41 
42 // This allows safe indexing with char.
43 // In C++, char may be (and often is) signed!
44 #define ORD(ch) static_cast<unsigned char>(ch)
45 #define TABSIZE (std::numeric_limits<unsigned char>::max () + 1)
46 
47 // This is the quick search algorithm, as described at
48 // http://www-igm.univ-mlv.fr/~lecroq/string/node19.html
49 static void
50 qs_preprocess (const Array<char>& needle,
51  octave_idx_type *table)
52 {
53  const char *x = needle.data ();
54  octave_idx_type m = needle.numel ();
55 
56  for (octave_idx_type i = 0; i < TABSIZE; i++)
57  table[i] = m + 1;
58  for (octave_idx_type i = 0; i < m; i++)
59  table[ORD(x[i])] = m - i;
60 }
61 
62 
64 qs_search (const Array<char>& needle,
65  const Array<char>& haystack,
66  const octave_idx_type *table,
67  bool overlaps = true)
68 {
69  const char *x = needle.data ();
70  octave_idx_type m = needle.numel ();
71  const char *y = haystack.data ();
72  octave_idx_type n = haystack.numel ();
73 
74  // We'll use deque because it typically has the most favorable properties for
75  // the operation we need.
76  std::deque<octave_idx_type> accum;
77  if (m == 1)
78  {
79  // Looking for a single character.
80  for (octave_idx_type i = 0; i < n; i++)
81  {
82  if (y[i] == x[0])
83  accum.push_back (i);
84  }
85  }
86  else if (m == 2)
87  {
88  // Two characters.
89  if (overlaps)
90  {
91  for (octave_idx_type i = 0; i < n-1; i++)
92  {
93  if (y[i] == x[0] && y[i+1] == x[1])
94  accum.push_back (i);
95  }
96  }
97  else
98  {
99  for (octave_idx_type i = 0; i < n-1; i++)
100  {
101  if (y[i] == x[0] && y[i+1] == x[1])
102  accum.push_back (i++);
103  }
104  }
105  }
106  else if (n >= m)
107  {
108  // General case.
109  octave_idx_type j = 0;
110 
111  if (overlaps)
112  {
113  while (j < n - m)
114  {
115  if (std::equal (x, x + m, y + j))
116  accum.push_back (j);
117  j += table[ORD(y[j + m])];
118  }
119  }
120  else
121  {
122  while (j < n - m)
123  {
124  if (std::equal (x, x + m, y + j))
125  {
126  accum.push_back (j);
127  j += m;
128  }
129  else
130  j += table[ORD(y[j + m])];
131  }
132  }
133 
134  if (j == n - m && std::equal (x, x + m, y + j))
135  accum.push_back (j);
136  }
137 
138  octave_idx_type nmatch = accum.size ();
139  octave_idx_type one = 1;
140  Array<octave_idx_type> result (dim_vector (std::min (one, nmatch), nmatch));
141  octave_idx_type k = 0;
142  for (std::deque<octave_idx_type>::const_iterator iter = accum.begin ();
143  iter != accum.end (); iter++)
144  {
145  result.xelem (k++) = *iter;
146  }
147 
148  return result;
149 }
150 
151 DEFUN (strfind, args, ,
152  "-*- texinfo -*-\n\
153 @deftypefn {Built-in Function} {@var{idx} =} strfind (@var{str}, @var{pattern})\n\
154 @deftypefnx {Built-in Function} {@var{idx} =} strfind (@var{cellstr}, @var{pattern})\n\
155 @deftypefnx {Built-in Function} {@var{idx} =} strfind (@dots{}, \"overlaps\", @var{val})\n\
156 Search for @var{pattern} in the string @var{str} and return the starting\n\
157 index of every such occurrence in the vector @var{idx}.\n\
158 \n\
159 If there is no such occurrence, or if @var{pattern} is longer than\n\
160 @var{str}, or if @var{pattern} itself is empty, then @var{idx} is the empty\n\
161 array @code{[]}.\n\
162 \n\
163 The optional argument @qcode{\"overlaps\"} determines whether the pattern\n\
164 can match at every position in @var{str} (true), or only for unique\n\
165 occurrences of the complete pattern (false). The default is true.\n\
166 \n\
167 If a cell array of strings @var{cellstr} is specified then @var{idx} is a\n\
168 cell array of vectors, as specified above.\n\
169 \n\
170 Examples:\n\
171 \n\
172 @example\n\
173 @group\n\
174 strfind (\"abababa\", \"aba\")\n\
175  @result{} [1, 3, 5]\n\
176 \n\
177 strfind (\"abababa\", \"aba\", \"overlaps\", false)\n\
178  @result{} [1, 5]\n\
179 \n\
180 strfind (@{\"abababa\", \"bebebe\", \"ab\"@}, \"aba\")\n\
181  @result{}\n\
182  @{\n\
183  [1,1] =\n\
184 \n\
185  1 3 5\n\
186 \n\
187  [1,2] = [](1x0)\n\
188  [1,3] = [](1x0)\n\
189  @}\n\
190 @end group\n\
191 @end example\n\
192 @seealso{findstr, strmatch, regexp, regexpi, find}\n\
193 @end deftypefn")
194 {
195  octave_value retval;
196  int nargin = args.length ();
197  bool overlaps = true;
198 
199  if (nargin == 4 && args(2).is_string () && args(3).is_scalar_type ())
200  {
201  std::string opt = args(2).string_value ();
202  if (opt == "overlaps")
203  {
204  overlaps = args(3).bool_value ();
205  nargin = 2;
206  }
207  else
208  {
209  error ("strfind: unknown option: %s", opt.c_str ());
210  return retval;
211  }
212  }
213 
214  if (nargin == 2)
215  {
216  octave_value argstr = args(0);
217  octave_value argpat = args(1);
218  if (argpat.is_string ())
219  {
220  Array<char> needle = argpat.char_array_value ();
222  qs_preprocess (needle, table);
223 
224  if (argstr.is_string ())
225  if (argpat.is_empty ())
226  // Return a null matrix for null pattern for MW compatibility
227  retval = Matrix ();
228  else
229  retval = octave_value (qs_search (needle,
230  argstr.char_array_value (),
231  table, overlaps),
232  true, true);
233  else if (argstr.is_cell ())
234  {
235  const Cell argsc = argstr.cell_value ();
236  Cell retc (argsc.dims ());
237  octave_idx_type ns = argsc.numel ();
238 
239  for (octave_idx_type i = 0; i < ns; i++)
240  {
241  octave_value argse = argsc(i);
242  if (argse.is_string ())
243  {
244  if (argpat.is_empty ())
245  retc(i) = Matrix ();
246  else
247  retc(i) = octave_value (qs_search (needle,
248  argse.char_array_value (),
249  table, overlaps),
250  true, true);
251  }
252  else
253  {
254  error ("strfind: each element of CELLSTR must be a string");
255  break;
256  }
257  }
258 
259  retval = retc;
260  }
261  else
262  error ("strfind: first argument must be a string or cell array of strings");
263  }
264  else if (argpat.is_cell ())
265  retval = do_simple_cellfun (Fstrfind, "strfind", args);
266  else
267  error ("strfind: PATTERN must be a string or cell array of strings");
268  }
269  else
270  print_usage ();
271 
272  return retval;
273 }
274 
275 /*
276 %!assert (strfind ("abababa", "aba"), [1, 3, 5])
277 %!assert (strfind ("abababa", "aba", "overlaps", false), [1, 5])
278 %!assert (strfind ({"abababa", "bla", "bla"}, "a"), {[1, 3, 5, 7], 3, 3})
279 %!assert (strfind ("Linux _is_ user-friendly. It just isn't ignorant-friendly or idiot-friendly.", "friendly"), [17, 50, 68])
280 %!assert (strfind ("abc", ""), [])
281 %!assert (strfind ("abc", {"", "b", ""}), {[], 2, []})
282 %!assert (strfind ({"abc", "def"}, ""), {[], []})
283 
284 %!error strfind ()
285 %!error strfind ("foo", "bar", 1)
286 %!error <unknown option: foobar> strfind ("foo", 100, "foobar", 1)
287 %!error <each element of CELLSTR must be a string> strfind ({"A", 1}, "foo")
288 %!error <first argument must be a string> strfind (100, "foo")
289 %!error <PATTERN must be a string> strfind ("foo", 100)
290 */
291 
292 static Array<char>
293 qs_replace (const Array<char>& str, const Array<char>& pat,
294  const Array<char>& rep,
295  const octave_idx_type *table,
296  bool overlaps = true)
297 {
298  Array<char> ret = str;
299 
300  octave_idx_type siz = str.numel ();
301  octave_idx_type psiz = pat.numel ();
302  octave_idx_type rsiz = rep.numel ();
303 
304  if (psiz != 0)
305  {
306  // Look up matches, without overlaps.
307  const Array<octave_idx_type> idx = qs_search (pat, str, table, overlaps);
308  octave_idx_type nidx = idx.numel ();
309 
310  if (nidx)
311  {
312  // Compute result size.
313  octave_idx_type retsiz;
314  if (overlaps)
315  {
316  retsiz = 0;
317  // OMG. Is this the "right answer" MW always looks for, or
318  // someone was just lazy?
319  octave_idx_type k = 0;
320  for (octave_idx_type i = 0; i < nidx; i++)
321  {
322  octave_idx_type j = idx(i);
323  if (j >= k)
324  retsiz += j - k;
325  retsiz += rsiz;
326  k = j + psiz;
327  }
328 
329  retsiz += siz - k;
330  }
331  else
332  retsiz = siz + nidx * (rsiz - psiz);
333 
334  if (retsiz == 0)
335  ret.clear (dim_vector (0, 0));
336  else
337  {
338  ret.clear (dim_vector (1, retsiz));
339  const char *src = str.data ();
340  const char *reps = rep.data ();
341  char *dest = ret.fortran_vec ();
342 
343  octave_idx_type k = 0;
344  for (octave_idx_type i = 0; i < nidx; i++)
345  {
346  octave_idx_type j = idx(i);
347  if (j >= k)
348  dest = std::copy (src + k, src + j, dest);
349  dest = std::copy (reps, reps + rsiz, dest);
350  k = j + psiz;
351  }
352 
353  std::copy (src + k, src + siz, dest);
354  }
355  }
356  }
357 
358  return ret;
359 }
360 
361 DEFUN (strrep, args, ,
362  "-*- texinfo -*-\n\
363 @deftypefn {Built-in Function} {@var{newstr} =} strrep (@var{str}, @var{ptn}, @var{rep})\n\
364 @deftypefnx {Built-in Function} {@var{newstr} =} strrep (@var{cellstr}, @var{ptn}, @var{rep})\n\
365 @deftypefnx {Built-in Function} {@var{newstr} =} strrep (@dots{}, \"overlaps\", @var{val})\n\
366 Replace all occurrences of the pattern @var{ptn} in the string @var{str}\n\
367 with the string @var{rep} and return the result.\n\
368 \n\
369 The optional argument @qcode{\"overlaps\"} determines whether the pattern\n\
370 can match at every position in @var{str} (true), or only for unique\n\
371 occurrences of the complete pattern (false). The default is true.\n\
372 \n\
373 @var{s} may also be a cell array of strings, in which case the replacement is\n\
374 done for each element and a cell array is returned.\n\
375 \n\
376 Example:\n\
377 \n\
378 @example\n\
379 @group\n\
380 strrep (\"This is a test string\", \"is\", \"&%$\")\n\
381  @result{} \"Th&%$ &%$ a test string\"\n\
382 @end group\n\
383 @end example\n\
384 \n\
385 @seealso{regexprep, strfind, findstr}\n\
386 @end deftypefn")
387 {
388  octave_value retval;
389  int nargin = args.length ();
390  bool overlaps = true;
391 
392  if (nargin == 5 && args(3).is_string () && args(4).is_scalar_type ())
393  {
394  std::string opt = args(3).string_value ();
395  if (opt == "overlaps")
396  {
397  overlaps = args(4).bool_value ();
398  nargin = 3;
399  }
400  else
401  {
402  error ("strrep: unknown option: %s", opt.c_str ());
403  return retval;
404  }
405  }
406 
407  if (nargin == 3)
408  {
409  octave_value argstr = args(0);
410  octave_value argpat = args(1);
411  octave_value argrep = args(2);
412  if (argpat.is_string () && argrep.is_string ())
413  {
414  const Array<char> pat = argpat.char_array_value ();
415  const Array<char> rep = argrep.char_array_value ();
416 
418  qs_preprocess (pat, table);
419 
420  if (argstr.is_string ())
421  retval = qs_replace (argstr.char_array_value (), pat, rep,
422  table, overlaps);
423  else if (argstr.is_cell ())
424  {
425  const Cell argsc = argstr.cell_value ();
426  Cell retc (argsc.dims ());
427  octave_idx_type ns = argsc.numel ();
428 
429  for (octave_idx_type i = 0; i < ns; i++)
430  {
431  octave_value argse = argsc(i);
432  if (argse.is_string ())
433  retc(i) = qs_replace (argse.char_array_value (), pat, rep,
434  table, overlaps);
435  else
436  {
437  error ("strrep: each element of S must be a string");
438  break;
439  }
440  }
441 
442  retval = retc;
443  }
444  else
445  error ("strrep: S must be a string or cell array of strings");
446  }
447  else if (argpat.is_cell () || argrep.is_cell ())
448  retval = do_simple_cellfun (Fstrrep, "strrep", args);
449  else
450  error ("strrep: PTN and REP arguments must be strings or cell arrays of strings");
451  }
452  else
453  print_usage ();
454 
455  return retval;
456 }
457 
458 /*
459 %!assert (strrep ("This is a test string", "is", "&%$"),
460 %! "Th&%$ &%$ a test string")
461 %!assert (strrep ("abababc", "abab", "xyz"), "xyzxyzc")
462 %!assert (strrep ("abababc", "abab", "xyz", "overlaps", false), "xyzabc")
463 
464 %!assert (size (strrep ("a", "a", "")), [0 0])
465 
466 %!error strrep ()
467 %!error strrep ("foo", "bar", 3, 4)
468 */
Definition: Cell.h:35
charNDArray char_array_value(bool frc_str_conv=false) const
Definition: ov.h:817
OCTINTERP_API void print_usage(void)
Definition: defun.cc:51
octave_idx_type numel(void) const
Number of elements in the array.
Definition: Array.h:275
bool is_scalar_type(void) const
Definition: ov.h:657
static Array< octave_idx_type > qs_search(const Array< char > &needle, const Array< char > &haystack, const octave_idx_type *table, bool overlaps=true)
Definition: strfind.cc:64
#define DEFUN(name, args_name, nargout_name, doc)
Definition: defun.h:44
void error(const char *fmt,...)
Definition: error.cc:476
bool is_cell(void) const
Definition: ov.h:529
octave_base_value * rep
Definition: ov.h:1248
Cell cell_value(void) const
Definition: ov.cc:1566
OCTAVE_EXPORT octave_value_list Fstrfind(const octave_value_list &args, int)
Definition: strfind.cc:193
const dim_vector & dims(void) const
Return a const-reference so that dims ()(i) works efficiently.
Definition: Array.h:337
bool is_string(void) const
Definition: ov.h:562
const T * data(void) const
Definition: Array.h:479
octave_idx_type length(void) const
Definition: ov.cc:1525
Definition: dMatrix.h:35
#define ORD(ch)
Definition: strfind.cc:44
T & xelem(octave_idx_type n)
Definition: Array.h:353
static Array< char > qs_replace(const Array< char > &str, const Array< char > &pat, const Array< char > &rep, const octave_idx_type *table, bool overlaps=true)
Definition: strfind.cc:293
void clear(void)
Definition: Array.cc:84
bool is_empty(void) const
Definition: ov.h:526
OCTAVE_EXPORT octave_value_list Fstrrep(const octave_value_list &args, int)
Definition: strfind.cc:386
octave_value_list do_simple_cellfun(octave_value_list(*fun)(const octave_value_list &, int), const char *fun_name, const octave_value_list &args, int nargout)
Definition: utils.cc:1437
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:197
static void qs_preprocess(const Array< char > &needle, octave_idx_type *table)
Definition: strfind.cc:50
const T * fortran_vec(void) const
Definition: Array.h:481
#define TABSIZE
Definition: strfind.cc:45
return octave_value(v1.char_array_value().concat(v2.char_array_value(), ra_idx),((a1.is_sq_string()||a2.is_sq_string())? '\'': '"'))
F77_RET_T const double * x
charNDArray min(char d, const charNDArray &m)
Definition: chNDArray.cc:210