GNU Octave  4.4.1
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
strfind.cc
Go to the documentation of this file.
1 /*
2 
3 Copyright (C) 2009-2018 Jaroslav Hajek
4 Copyright (C) 2009-2010 VZLU Prague
5 
6 This file is part of Octave.
7 
8 Octave is free software: you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12 
13 Octave is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with Octave; see the file COPYING. If not, see
20 <https://www.gnu.org/licenses/>.
21 
22 */
23 
24 #if defined (HAVE_CONFIG_H)
25 # include "config.h"
26 #endif
27 
28 #include <algorithm>
29 #include <deque>
30 #include <limits>
31 #include <string>
32 
33 #include "oct-locbuf.h"
34 
35 #include "Cell.h"
36 #include "builtin-defun-decls.h"
37 #include "defun.h"
38 #include "errwarn.h"
39 #include "ov.h"
40 #include "unwind-prot.h"
41 #include "utils.h"
42 
43 // This allows safe indexing with char.
44 // In C++, char may be (and often is) signed!
45 #define ORD(ch) static_cast<unsigned char>(ch)
46 #define TABSIZE (std::numeric_limits<unsigned char>::max () + 1)
47 
48 // This is the quick search algorithm, as described at
49 // http://www-igm.univ-mlv.fr/~lecroq/string/node19.html
50 static void
51 qs_preprocess (const Array<char>& needle,
52  octave_idx_type *table)
53 {
54  const char *x = needle.data ();
55  octave_idx_type m = needle.numel ();
56 
57  for (octave_idx_type i = 0; i < TABSIZE; i++)
58  table[i] = m + 1;
59  for (octave_idx_type i = 0; i < m; i++)
60  table[ORD(x[i])] = m - i;
61 }
62 
64 qs_search (const Array<char>& needle,
65  const Array<char>& haystack,
66  const octave_idx_type *table,
67  bool overlaps = true)
68 {
69  const char *x = needle.data ();
70  octave_idx_type m = needle.numel ();
71  const char *y = haystack.data ();
72  octave_idx_type n = haystack.numel ();
73 
74  // We'll use deque because it typically has the most favorable properties for
75  // the operation we need.
76  std::deque<octave_idx_type> accum;
77  if (m == 1)
78  {
79  // Looking for a single character.
80  for (octave_idx_type i = 0; i < n; i++)
81  {
82  if (y[i] == x[0])
83  accum.push_back (i);
84  }
85  }
86  else if (m == 2)
87  {
88  // Two characters.
89  if (overlaps)
90  {
91  for (octave_idx_type i = 0; i < n-1; i++)
92  {
93  if (y[i] == x[0] && y[i+1] == x[1])
94  accum.push_back (i);
95  }
96  }
97  else
98  {
99  for (octave_idx_type i = 0; i < n-1; i++)
100  {
101  if (y[i] == x[0] && y[i+1] == x[1])
102  accum.push_back (i++);
103  }
104  }
105  }
106  else if (n >= m)
107  {
108  // General case.
109  octave_idx_type j = 0;
110 
111  if (overlaps)
112  {
113  while (j < n - m)
114  {
115  if (std::equal (x, x + m, y + j))
116  accum.push_back (j);
117  j += table[ORD(y[j + m])];
118  }
119  }
120  else
121  {
122  while (j < n - m)
123  {
124  if (std::equal (x, x + m, y + j))
125  {
126  accum.push_back (j);
127  j += m;
128  }
129  else
130  j += table[ORD(y[j + m])];
131  }
132  }
133 
134  if (j == n - m && std::equal (x, x + m, y + j))
135  accum.push_back (j);
136  }
137 
138  octave_idx_type nmatch = accum.size ();
139  octave_idx_type one = 1;
140  Array<octave_idx_type> result (dim_vector (std::min (one, nmatch), nmatch));
141  octave_idx_type k = 0;
142  for (const auto& idx : accum)
143  result.xelem (k++) = idx;
144 
145  return result;
146 }
147 
148 DEFUN (strfind, args, ,
149  doc: /* -*- texinfo -*-
150 @deftypefn {} {@var{idx} =} strfind (@var{str}, @var{pattern})
151 @deftypefnx {} {@var{idx} =} strfind (@var{cellstr}, @var{pattern})
152 @deftypefnx {} {@var{idx} =} strfind (@dots{}, "overlaps", @var{val})
153 Search for @var{pattern} in the string @var{str} and return the starting
154 index of every such occurrence in the vector @var{idx}.
155 
156 If there is no such occurrence, or if @var{pattern} is longer than
157 @var{str}, or if @var{pattern} itself is empty, then @var{idx} is the empty
158 array @code{[]}.
159 
160 The optional argument @qcode{"overlaps"} determines whether the pattern
161 can match at every position in @var{str} (true), or only for unique
162 occurrences of the complete pattern (false). The default is true.
163 
164 If a cell array of strings @var{cellstr} is specified then @var{idx} is a
165 cell array of vectors, as specified above.
166 
167 Examples:
168 
169 @example
170 @group
171 strfind ("abababa", "aba")
172  @result{} [1, 3, 5]
173 
174 strfind ("abababa", "aba", "overlaps", false)
175  @result{} [1, 5]
176 
177 strfind (@{"abababa", "bebebe", "ab"@}, "aba")
178  @result{}
179  @{
180  [1,1] =
181 
182  1 3 5
183 
184  [1,2] = [](1x0)
185  [1,3] = [](1x0)
186  @}
187 @end group
188 @end example
189 @seealso{findstr, strmatch, regexp, regexpi, find}
190 @end deftypefn */)
191 {
192  int nargin = args.length ();
193 
194  if (nargin != 4 && nargin != 2)
195  print_usage ();
196 
197  bool overlaps = true;
198  if (nargin == 4)
199  {
200  if (! args(2).is_string () || ! args(3).is_scalar_type ())
201  error ("strfind: invalid optional arguments");
202 
203  std::string opt = args(2).string_value ();
204 
205  if (opt == "overlaps")
206  overlaps = args(3).bool_value ();
207  else
208  error ("strfind: unknown option: %s", opt.c_str ());
209  }
210 
212 
213  octave_value argstr = args(0);
214  octave_value argpat = args(1);
215 
216  if (argpat.is_string ())
217  {
218  Array<char> needle = argpat.char_array_value ();
220  qs_preprocess (needle, table);
221 
222  if (argstr.is_string ())
223  if (argpat.isempty ())
224  // Return a null matrix for null pattern for MW compatibility
225  retval = Matrix ();
226  else
227  retval = octave_value (qs_search (needle,
228  argstr.char_array_value (),
229  table, overlaps),
230  true, true);
231  else if (argstr.iscell ())
232  {
233  const Cell argsc = argstr.cell_value ();
234  Cell retc (argsc.dims ());
235  octave_idx_type ns = argsc.numel ();
236 
237  for (octave_idx_type i = 0; i < ns; i++)
238  {
239  octave_value argse = argsc(i);
240  if (! argse.is_string ())
241  error ("strfind: each element of CELLSTR must be a string");
242 
243  if (argpat.isempty ())
244  retc(i) = Matrix ();
245  else
246  retc(i) = octave_value (qs_search (needle,
247  argse.char_array_value (),
248  table, overlaps),
249  true, true);
250  }
251 
252  retval = retc;
253  }
254  else
255  error ("strfind: first argument must be a string or cell array of strings");
256  }
257  else if (argpat.iscell ())
258  retval = do_simple_cellfun (Fstrfind, "strfind", args);
259  else
260  error ("strfind: PATTERN must be a string or cell array of strings");
261 
262  return retval;
263 }
264 
265 /*
266 %!assert (strfind ("abababa", "aba"), [1, 3, 5])
267 %!assert (strfind ("abababa", "aba", "overlaps", false), [1, 5])
268 %!assert (strfind ({"abababa", "bla", "bla"}, "a"), {[1, 3, 5, 7], 3, 3})
269 %!assert (strfind ("Linux _is_ user-friendly. It just isn't ignorant-friendly or idiot-friendly.", "friendly"), [17, 50, 68])
270 %!assert (strfind ("abc", ""), [])
271 %!assert (strfind ("abc", {"", "b", ""}), {[], 2, []})
272 %!assert (strfind ({"abc", "def"}, ""), {[], []})
273 
274 %!error strfind ()
275 %!error strfind ("foo", "bar", 1)
276 %!error <unknown option: foobar> strfind ("foo", 100, "foobar", 1)
277 %!error <each element of CELLSTR must be a string> strfind ({"A", 1}, "foo")
278 %!error <first argument must be a string> strfind (100, "foo")
279 %!error <PATTERN must be a string> strfind ("foo", 100)
280 */
281 
282 static Array<char>
283 qs_replace (const Array<char>& str, const Array<char>& pat,
284  const Array<char>& rep,
285  const octave_idx_type *table,
286  bool overlaps = true)
287 {
288  Array<char> ret = str;
289 
290  octave_idx_type siz = str.numel ();
291  octave_idx_type psiz = pat.numel ();
292  octave_idx_type rsiz = rep.numel ();
293 
294  if (psiz != 0)
295  {
296  // Look up matches, without overlaps.
297  const Array<octave_idx_type> idx = qs_search (pat, str, table, overlaps);
298  octave_idx_type nidx = idx.numel ();
299 
300  if (nidx)
301  {
302  // Compute result size.
303  octave_idx_type retsiz;
304  if (overlaps)
305  {
306  retsiz = 0;
307  // OMG. Is this the "right answer" MW always looks for, or
308  // someone was just lazy?
309  octave_idx_type k = 0;
310  for (octave_idx_type i = 0; i < nidx; i++)
311  {
312  octave_idx_type j = idx(i);
313  if (j >= k)
314  retsiz += j - k;
315  retsiz += rsiz;
316  k = j + psiz;
317  }
318 
319  retsiz += siz - k;
320  }
321  else
322  retsiz = siz + nidx * (rsiz - psiz);
323 
324  if (retsiz == 0)
325  ret.clear (dim_vector (0, 0));
326  else
327  {
328  ret.clear (dim_vector (1, retsiz));
329  const char *src = str.data ();
330  const char *reps = rep.data ();
331  char *dest = ret.fortran_vec ();
332 
333  octave_idx_type k = 0;
334  for (octave_idx_type i = 0; i < nidx; i++)
335  {
336  octave_idx_type j = idx(i);
337  if (j >= k)
338  dest = std::copy (src + k, src + j, dest);
339  dest = std::copy (reps, reps + rsiz, dest);
340  k = j + psiz;
341  }
342 
343  std::copy (src + k, src + siz, dest);
344  }
345  }
346  }
347 
348  return ret;
349 }
350 
351 DEFUN (strrep, args, ,
352  doc: /* -*- texinfo -*-
353 @deftypefn {} {@var{newstr} =} strrep (@var{str}, @var{ptn}, @var{rep})
354 @deftypefnx {} {@var{newstr} =} strrep (@var{cellstr}, @var{ptn}, @var{rep})
355 @deftypefnx {} {@var{newstr} =} strrep (@dots{}, "overlaps", @var{val})
356 Replace all occurrences of the pattern @var{ptn} in the string @var{str}
357 with the string @var{rep} and return the result.
358 
359 The optional argument @qcode{"overlaps"} determines whether the pattern
360 can match at every position in @var{str} (true), or only for unique
361 occurrences of the complete pattern (false). The default is true.
362 
363 @var{s} may also be a cell array of strings, in which case the replacement
364 is done for each element and a cell array is returned.
365 
366 Example:
367 
368 @example
369 @group
370 strrep ("This is a test string", "is", "&%$")
371  @result{} "Th&%$ &%$ a test string"
372 @end group
373 @end example
374 
375 @seealso{regexprep, strfind, findstr}
376 @end deftypefn */)
377 {
378  int nargin = args.length ();
379 
380  if (nargin != 3 && nargin != 5)
381  print_usage ();
382 
383  bool overlaps = true;
384 
385  if (nargin == 5)
386  {
387  if (! args(3).is_string () || ! args(4).is_scalar_type ())
388  error ("strrep: invalid optional arguments");
389 
390  std::string opt = args(3).string_value ();
391  if (opt == "overlaps")
392  overlaps = args(4).bool_value ();
393  else
394  error ("strrep: unknown option: %s", opt.c_str ());
395  }
396 
398 
399  octave_value argstr = args(0);
400  octave_value argpat = args(1);
401  octave_value argrep = args(2);
402 
403  if (argpat.is_string () && argrep.is_string ())
404  {
405  const Array<char> pat = argpat.char_array_value ();
406  const Array<char> rep = argrep.char_array_value ();
407 
409  qs_preprocess (pat, table);
410 
411  if (argstr.is_string ())
412  retval = qs_replace (argstr.char_array_value (), pat, rep,
413  table, overlaps);
414  else if (argstr.iscell ())
415  {
416  const Cell argsc = argstr.cell_value ();
417  Cell retc (argsc.dims ());
418  octave_idx_type ns = argsc.numel ();
419 
420  for (octave_idx_type i = 0; i < ns; i++)
421  {
422  octave_value argse = argsc(i);
423  if (argse.is_string ())
424  retc(i) = qs_replace (argse.char_array_value (), pat, rep,
425  table, overlaps);
426  else
427  error ("strrep: each element of S must be a string");
428  }
429 
430  retval = retc;
431  }
432  else
433  error ("strrep: S must be a string or cell array of strings");
434  }
435  else if (argpat.iscell () || argrep.iscell ())
436  retval = do_simple_cellfun (Fstrrep, "strrep", args);
437  else
438  error ("strrep: PTN and REP arguments must be strings or cell arrays of strings");
439 
440  return retval;
441 }
442 
443 /*
444 %!assert (strrep ("This is a test string", "is", "&%$"),
445 %! "Th&%$ &%$ a test string")
446 %!assert (strrep ("abababc", "abab", "xyz"), "xyzxyzc")
447 %!assert (strrep ("abababc", "abab", "xyz", "overlaps", false), "xyzabc")
448 
449 %!assert (size (strrep ("a", "a", "")), [0 0])
450 
451 %!error strrep ()
452 %!error strrep ("foo", "bar", 3, 4)
453 */
Definition: Cell.h:37
bool isempty(void) const
Definition: ov.h:529
const T * data(void) const
Definition: Array.h:582
OCTINTERP_API void print_usage(void)
Definition: defun.cc:54
for large enough k
Definition: lu.cc:617
const T * fortran_vec(void) const
Definition: Array.h:584
static Array< octave_idx_type > qs_search(const Array< char > &needle, const Array< char > &haystack, const octave_idx_type *table, bool overlaps=true)
Definition: strfind.cc:64
#define DEFUN(name, args_name, nargout_name, doc)
Macro to define a builtin function.
Definition: defun.h:53
void error(const char *fmt,...)
Definition: error.cc:578
const dim_vector & dims(void) const
Return a const-reference so that dims ()(i) works efficiently.
Definition: Array.h:442
virtual octave_idx_type numel(const octave_value_list &)
Definition: ov-base.cc:193
OCTAVE_EXPORT octave_value_list Fstrfind(const octave_value_list &args, int) or if ar
Definition: strfind.cc:190
octave_base_value * rep
The real representation.
Definition: ov.h:1505
charNDArray char_array_value(bool frc_str_conv=false) const
Definition: ov.h:878
bool iscell(void) const
Definition: ov.h:536
std::string str
Definition: hash.cc:118
octave_value retval
Definition: data.cc:6246
Definition: dMatrix.h:36
#define ORD(ch)
Definition: strfind.cc:45
With real return the complex result
Definition: data.cc:3260
return octave_value(v1.char_array_value() . concat(v2.char_array_value(), ra_idx),((a1.is_sq_string()||a2.is_sq_string()) ? '\'' :'"'))
void clear(void)
Definition: Array.cc:86
OCTINTERP_API octave_value_list do_simple_cellfun(octave_value_list(*fun)(const octave_value_list &, int), const char *fun_name, const octave_value_list &args, int nargout)
the element is set to zero In other the statement xample y
Definition: data.cc:5264
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:41
args.length() nargin
Definition: file-io.cc:589
for i
Definition: data.cc:5264
bool is_string(void) const
Definition: ov.h:577
static void qs_preprocess(const Array< char > &needle, octave_idx_type *table)
Definition: strfind.cc:51
octave_idx_type numel(void) const
Number of elements in the array.
Definition: Array.h:366
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:87
If this string is the system will ring the terminal sometimes it is useful to be able to print the original representation of the string
Definition: utils.cc:888
#define TABSIZE
Definition: strfind.cc:46
bool is_scalar_type(void) const
Definition: ov.h:717
Cell cell_value(void) const
F77_RET_T const F77_REAL const F77_REAL F77_REAL &F77_RET_T const F77_DBLE const F77_DBLE F77_DBLE &F77_RET_T const F77_DBLE F77_DBLE &F77_RET_T const F77_REAL F77_REAL &F77_RET_T const F77_DBLE * x
charNDArray min(char d, const charNDArray &m)
Definition: chNDArray.cc:204