blocxx
PerlRegEx.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2 * Copyright (C) 2005 Novell, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * - Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 *
10 * - Redistributions in binary form must reproduce the above copyright notice,
11 * this list of conditions and the following disclaimer in the documentation
12 * and/or other materials provided with the distribution.
13 *
14 * - Neither the name of Vintela, Inc., Novell, Inc., nor the names of its
15 * contributors may be used to endorse or promote products derived from this
16 * software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc., Novell, Inc., OR THE
22 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *******************************************************************************/
34 #include "blocxx/PerlRegEx.hpp"
35 
36 #ifdef BLOCXX_HAVE_PCRE
37 #ifdef BLOCXX_HAVE_PCRE_H
38 
#include "blocxx/ExceptionIds.hpp"
#include "blocxx/Assertion.hpp"
#include "blocxx/Format.hpp"
#include <climits> // for INT_MAX
#include <vector>  // for std::vector (match offset buffers)
43 
44 
45 namespace BLOCXX_NAMESPACE
46 {
47 
48 
49 // -------------------------------------------------------------------
50 static String
51 substitute_caps(const PerlRegEx::MatchArray &sub,
52  const String &str, const String &rep)
53 {
54  static const char *cap_refs[] = {
55  NULL, "\\1", "\\2", "\\3", "\\4",
56  "\\5", "\\6", "\\7", "\\8", "\\9", NULL
57  };
58 
59  String res( rep);
60  size_t pos;
61 
62  for(size_t i=1; cap_refs[i] != NULL; i++)
63  {
64  String cap;
65  if( i < sub.size() && sub[i].rm_so >= 0 && sub[i].rm_eo >= 0)
66  {
67  cap = str.substring(sub[i].rm_so, sub[i].rm_eo
68  - sub[i].rm_so);
69  }
70 
71  pos = res.indexOf(cap_refs[i]);
72  while( pos != String::npos)
73  {
74  size_t quotes = 0;
75  size_t at = pos;
76 
77  while( at > 0 && res.charAt(--at) == '\\')
78  quotes++;
79 
80  if( quotes % 2)
81  {
82  quotes = (quotes + 1) / 2;
83 
84  res = res.erase(pos - quotes, quotes);
85 
86  pos = res.indexOf(cap_refs[i],
87  pos + 2 - quotes);
88  }
89  else
90  {
91  quotes = quotes / 2;
92 
93  res = res.substring(0, pos - quotes) +
94  cap +
95  res.substring(pos + 2);
96 
97  pos = res.indexOf(cap_refs[i],
98  pos + cap.length() - quotes);
99  }
100  }
101  }
102  return res;
103 }
104 
105 
106 // -------------------------------------------------------------------
107 static inline String
108 getError(const int errcode)
109 {
110  const char *ptr;
111  switch(errcode)
112  {
113  case 0:
114  ptr = "match vector to small";
115  break;
116 
117  case PCRE_ERROR_NOMATCH:
118  ptr = "match failed";
119  break;
120 
121  case PCRE_ERROR_NULL:
122  ptr = "invalid argument";
123  break;
124 
125  case PCRE_ERROR_BADOPTION:
126  ptr = "unrecognized option";
127  break;
128 
129  case PCRE_ERROR_BADMAGIC:
130  ptr = "invalid magic number";
131  break;
132 
133  case PCRE_ERROR_UNKNOWN_NODE:
134  ptr = "unknown item in the compiled pattern";
135  break;
136 
137  case PCRE_ERROR_NOMEMORY:
138  ptr = "failed to allocate memory";
139  break;
140 
141  case PCRE_ERROR_NOSUBSTRING:
142  // .*_substring.* functions only
143  ptr = "failed to retrieve substring";
144  break;
145 
146  case PCRE_ERROR_MATCHLIMIT:
147  // match_limit in pcre_extra struct
148  ptr = "recursion or backtracking limit reached";
149  break;
150 
151  case PCRE_ERROR_CALLOUT:
152  // reserved for pcrecallout functions
153  ptr = "callout failure";
154  break;
155 
156  case PCRE_ERROR_BADUTF8:
157  ptr = "invalid UTF-8 byte sequence found";
158  break;
159 
160  case PCRE_ERROR_BADUTF8_OFFSET:
161  ptr = "not a UTF-8 character at specified index";
162  break;
163 
164  case PCRE_ERROR_PARTIAL:
165  ptr = "partial match";
166  break;
167 
168  case PCRE_ERROR_BADPARTIAL:
169  ptr = "pattern item not supported for partial matching";
170  break;
171 
172  case PCRE_ERROR_INTERNAL:
173  ptr = "unexpected internal error occurred";
174  break;
175 
176  case PCRE_ERROR_BADCOUNT:
177  ptr = "invalid (negative) match vector count";
178  break;
179 
180  default:
181  ptr = "unknown error code";
182  break;
183  }
184  return String(ptr);
185 }
186 
187 // -------------------------------------------------------------------
188 PerlRegEx::PerlRegEx()
189  : m_pcre(NULL)
190  , m_flags(0)
191  , m_ecode(0)
192 {
193 }
194 
195 
196 // -------------------------------------------------------------------
197 PerlRegEx::PerlRegEx(const String &regex, int cflags)
198  : m_pcre(NULL)
199  , m_flags(0)
200  , m_ecode(0)
201 {
202  if( !compile(regex, cflags))
203  {
204  BLOCXX_THROW_ERR(RegExCompileException,
205  errorString().c_str(), m_ecode);
206  }
207 }
208 
209 
210 // -------------------------------------------------------------------
211 PerlRegEx::PerlRegEx(const PerlRegEx &ref)
212  : m_pcre(NULL)
213  , m_flags(ref.m_flags)
214  , m_ecode(0)
215  , m_rxstr(ref.m_rxstr)
216 {
217  if( ref.m_pcre != NULL && !compile(ref.m_rxstr, ref.m_flags))
218  {
219  BLOCXX_THROW_ERR(RegExCompileException,
220  errorString().c_str(), m_ecode);
221  }
222 }
223 
224 // -------------------------------------------------------------------
225 PerlRegEx::~PerlRegEx()
226 {
227  if( m_pcre)
228  {
229  free(m_pcre);
230  m_pcre = NULL;
231  }
232 }
233 
234 
235 // -------------------------------------------------------------------
236 PerlRegEx &
237 PerlRegEx::operator = (const PerlRegEx &ref)
238 {
239  if( ref.m_pcre == NULL)
240  {
241  m_ecode = 0;
242  m_error.erase();
243  m_flags = ref.m_flags;
244  m_rxstr = ref.m_rxstr;
245  if( m_pcre != NULL)
246  {
247  free(m_pcre);
248  m_pcre = NULL;
249  }
250  }
251  else if( !compile(ref.m_rxstr, ref.m_flags))
252  {
253  BLOCXX_THROW_ERR(RegExCompileException,
254  errorString().c_str(), m_ecode);
255  }
256  return *this;
257 }
258 
259 
260 // -------------------------------------------------------------------
261 bool
262 PerlRegEx::compile(const String &regex, int cflags)
263 {
264  if( m_pcre)
265  {
266  free(m_pcre);
267  m_pcre = NULL;
268  }
269 
270  const char *errptr = NULL;
271 
272  m_ecode = 0;
273  m_pcre = ::pcre_compile(regex.c_str(), cflags,
274  &errptr, &m_ecode, NULL);
275  if( m_pcre == NULL)
276  {
277  m_error = String(errptr ? errptr : "");
278  m_rxstr.erase();
279  m_flags = 0;
280  return false;
281  }
282  else
283  {
284  m_error.erase();
285  m_rxstr = regex;
286  m_flags = cflags;
287  return true;
288  }
289 }
290 
291 
292 // -------------------------------------------------------------------
293 int
294 PerlRegEx::errorCode()
295 {
296  return m_ecode;
297 }
298 
299 
300 // -------------------------------------------------------------------
301 String
302 PerlRegEx::errorString() const
303 {
304  return m_error;
305 }
306 
307 
308 // -------------------------------------------------------------------
309 String
310 PerlRegEx::patternString() const
311 {
312  return m_rxstr;
313 }
314 
315 
316 // -------------------------------------------------------------------
317 int
318 PerlRegEx::compileFlags() const
319 {
320  return m_flags;
321 }
322 
323 
324 // -------------------------------------------------------------------
325 bool
326 PerlRegEx::isCompiled() const
327 {
328  return (m_pcre != NULL);
329 }
330 
331 
332 // -------------------------------------------------------------------
333 bool
334 PerlRegEx::execute(MatchArray &sub, const String &str,
335  size_t index, size_t count, int eflags)
336 {
337  if( m_pcre == NULL)
338  {
339  BLOCXX_THROW(RegExCompileException,
340  "Regular expression is not compiled");
341  }
342  if( count >= size_t(INT_MAX / 3))
343  {
344  BLOCXX_THROW(AssertionException,
345  "Match count limit exceeded");
346  }
347 
348  if( index > str.length())
349  {
350  BLOCXX_THROW(OutOfBoundsException,
351  Format("String index out of bounds ("
352  "length = %1, index = %2).",
353  str.length(), index
354  ).c_str());
355  }
356 
357  if( count == 0)
358  {
359  int cnt = 0;
360  int ret = ::pcre_fullinfo(m_pcre, NULL,
361  PCRE_INFO_CAPTURECOUNT, &cnt);
362  if( ret)
363  {
364  m_error = getError(m_ecode);
365  return false;
366  }
367  count = cnt > 0 ? cnt + 1 : 1;
368  }
369  int vsub[count * 3];
370 
371  sub.clear();
372  m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
373  index, eflags, vsub, count * 3);
374  //
375  // pcre_exec returns 0 if vector too small, negative value
376  // on errors or the number of matches (number of int pairs)
377  //
378  if( m_ecode > 0)
379  {
380  sub.resize(count); // as specified by user
381  for(size_t i = 0, n = 0; i < count; i++, n += 2)
382  {
383  match_t m = { vsub[n], vsub[n+1] };
384 
385  // if user wants more than detected
386  if( i >= (size_t)m_ecode)
387  m.rm_so = m.rm_eo = -1;
388 
389  sub[i] = m;
390  }
391  m_error.erase();
392  return true;
393  }
394  else
395  {
396  m_error = getError(m_ecode);
397  return false;
398  }
399 }
400 
401 
402 // -------------------------------------------------------------------
403 bool
404 PerlRegEx::execute(MatchVector &sub, const String &str,
405  size_t index, size_t count, int eflags)
406 {
407  if( m_pcre == NULL)
408  {
409  BLOCXX_THROW(RegExCompileException,
410  "Regular expression is not compiled");
411  }
412  if( count >= size_t(INT_MAX / 3))
413  {
414  BLOCXX_THROW(AssertionException,
415  "Match count limit exceeded");
416  }
417 
418  if( index > str.length())
419  {
420  BLOCXX_THROW(OutOfBoundsException,
421  Format("String index out of bounds ("
422  "length = %1, index = %2)",
423  str.length(), index
424  ).c_str());
425  }
426 
427  if( count == 0)
428  {
429  int cnt = 0;
430  int ret = ::pcre_fullinfo(m_pcre, NULL,
431  PCRE_INFO_CAPTURECOUNT, &cnt);
432  if( ret)
433  {
434  m_error = getError(m_ecode);
435  return false;
436  }
437  count = cnt > 0 ? cnt + 1 : 1;
438  }
439  int vsub[count * 3];
440 
441  sub.clear();
442  m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
443  index, eflags, vsub, count * 3);
444  //
445  // pcre_exec returns 0 if vector too small, negative value
446  // on errors or the number of matches (number of int pairs)
447  //
448  if( m_ecode > 0)
449  {
450  count *= 2;
451  m_ecode *= 2;
452  sub.resize(count); // as specified by user
453  for(size_t i = 0; i < count; i++)
454  {
455  // if user wants more than detected
456  if( i >= (size_t)m_ecode)
457  vsub[i] = -1;
458 
459  sub[i] = vsub[i];
460  }
461  return true;
462  }
463  else
464  {
465  m_error = getError(m_ecode);
466  return false;
467  }
468 }
469 
470 
471 // -------------------------------------------------------------------
473 PerlRegEx::capture(const String &str, size_t index, size_t count, int eflags)
474 {
475  if( m_pcre == NULL)
476  {
477  BLOCXX_THROW(RegExCompileException,
478  "Regular expression is not compiled");
479  }
480 
481  MatchArray rsub;
482  StringArray ssub;
483 
484  bool match = execute(rsub, str, index, count, eflags);
485  if( match)
486  {
487  if( rsub.empty())
488  {
489  BLOCXX_THROW(RegExCompileException,
490  "Non-capturing regular expression");
491  }
492 
493  MatchArray::const_iterator i=rsub.begin();
494  for( ; i != rsub.end(); ++i)
495  {
496  if( i->rm_so >= 0 && i->rm_eo >= 0)
497  {
498  ssub.push_back(str.substring(i->rm_so,
499  i->rm_eo - i->rm_so));
500  }
501  else
502  {
503  ssub.push_back(String(""));
504  }
505  }
506  }
507  else if(m_ecode != PCRE_ERROR_NOMATCH)
508  {
509  BLOCXX_THROW_ERR(RegExExecuteException,
510  errorString().c_str(), m_ecode);
511  }
512  return ssub;
513 }
514 
515 
516 // -------------------------------------------------------------------
517 blocxx::String
518 PerlRegEx::replace(const String &str, const String &rep,
519  bool global, int eflags)
520 {
521  if( m_pcre == NULL)
522  {
523  BLOCXX_THROW(RegExCompileException,
524  "Regular expression is not compiled");
525  }
526 
527  MatchArray rsub;
528  bool match;
529  size_t off = 0;
530  String out = str;
531 
532  do
533  {
534  match = execute(rsub, out, off, 0, eflags);
535  if( match)
536  {
537  if( rsub.empty() ||
538  rsub[0].rm_so < 0 ||
539  rsub[0].rm_eo < 0)
540  {
541  // only if empty (missused as guard).
542  BLOCXX_THROW(RegExCompileException,
543  "Non-capturing regular expression");
544  }
545 
546  String res = substitute_caps(rsub, out, rep);
547 
548  out = out.substring(0, rsub[0].rm_so) +
549  res + out.substring(rsub[0].rm_eo);
550 
551  off = rsub[0].rm_so + res.length();
552  }
553  else if(m_ecode == PCRE_ERROR_NOMATCH)
554  {
555  m_ecode = 0;
556  m_error.erase();
557  }
558  else
559  {
560  BLOCXX_THROW_ERR(RegExExecuteException,
561  errorString().c_str(), m_ecode);
562  }
563  } while(global && match && out.length() > off);
564 
565  return out;
566 }
567 
568 
569 // -------------------------------------------------------------------
571 PerlRegEx::split(const String &str, bool empty, int eflags)
572 {
573  if( m_pcre == NULL)
574  {
575  BLOCXX_THROW(RegExCompileException,
576  "Regular expression is not compiled");
577  }
578 
579  MatchArray rsub;
580  StringArray ssub;
581  bool match;
582  size_t off = 0;
583  size_t len = str.length();
584 
585  do
586  {
587  match = execute(rsub, str, off, 0, eflags);
588  if( match)
589  {
590  if( rsub.empty() ||
591  rsub[0].rm_so < 0 ||
592  rsub[0].rm_eo < 0)
593  {
594  BLOCXX_THROW(RegExCompileException,
595  "Non-capturing regular expression");
596  }
597 
598  if( empty || ((size_t)rsub[0].rm_so > off))
599  {
600  ssub.push_back(str.substring(off,
601  rsub[0].rm_so - off));
602  }
603  off = rsub[0].rm_eo;
604  }
605  else if(m_ecode == PCRE_ERROR_NOMATCH)
606  {
607  String tmp = str.substring(off);
608  if( empty || !tmp.empty())
609  {
610  ssub.push_back(tmp);
611  }
612  m_ecode = 0;
613  m_error.erase();
614  }
615  else
616  {
617  BLOCXX_THROW_ERR(RegExExecuteException,
618  errorString().c_str(), m_ecode);
619  }
620  } while(match && len > off);
621 
622  return ssub;
623 }
624 
625 
626 // -------------------------------------------------------------------
628 PerlRegEx::grep(const StringArray &src, int eflags)
629 {
630  if( m_pcre == NULL)
631  {
632  BLOCXX_THROW(RegExCompileException,
633  "Regular expression is not compiled");
634  }
635 
636  m_ecode = 0;
637  m_error.erase();
638 
639  StringArray out;
640  if( !src.empty())
641  {
642  StringArray::const_iterator i=src.begin();
643  for( ; i != src.end(); ++i)
644  {
645  int ret = ::pcre_exec(m_pcre, NULL, i->c_str(),
646  i->length(), 0, eflags, NULL, 0);
647  if( ret >= 0)
648  {
649  out.push_back(*i);
650  }
651  else if( ret != PCRE_ERROR_NOMATCH)
652  {
653  m_ecode = ret;
654  m_error = getError(m_ecode);
655  BLOCXX_THROW_ERR(RegExExecuteException,
656  errorString().c_str(), m_ecode);
657  }
658  }
659  }
660  return out;
661 }
662 
663 
664 // -------------------------------------------------------------------
665 bool
666 PerlRegEx::match(const String &str, size_t index, int eflags) const
667 {
668  if( m_pcre == NULL)
669  {
670  BLOCXX_THROW(RegExCompileException,
671  "Regular expression is not compiled");
672  }
673 
674  if( index > str.length())
675  {
676  BLOCXX_THROW(OutOfBoundsException,
677  Format("String index out of bounds."
678  "length = %1, index = %2",
679  str.length(), index
680  ).c_str());
681  }
682 
683  m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(),
684  str.length(), 0, eflags, NULL, 0);
685  if( m_ecode >= 0)
686  {
687  m_error.erase();
688  return true;
689  }
690  else if( m_ecode == PCRE_ERROR_NOMATCH)
691  {
692  m_error = getError(m_ecode);
693  return false;
694  }
695  else
696  {
697  m_error = getError(m_ecode);
698  BLOCXX_THROW_ERR(RegExExecuteException,
699  errorString().c_str(), m_ecode);
700  }
701 }
702 
703 
704 // -------------------------------------------------------------------
705 } // namespace BLOCXX_NAMESPACE
706 
707 #endif // BLOCXX_HAVE_PCRE_H
708 #endif // BLOCXX_HAVE_PCRE
709 
710 /* vim: set ts=8 sts=8 sw=8 ai noet: */
711