A regular expression library for Nim using PCRE to do the hard work.
Note: If you love sequtils.toSeq
we have bad news for you. This library doesn't work with it due to documented compiler limitations. As a workaround, use this:
import nre except toSeq
PCRE has some additional terms that you must agree to in order to use this module.
import nre

let vowels = re"[aeoui]"

for match in "moigagoo".findIter(vowels):
  echo match.matchBounds
# (a: 1, b: 1)
# (a: 2, b: 2)
# (a: 4, b: 4)
# (a: 6, b: 6)
# (a: 7, b: 7)

let firstVowel = "foo".find(vowels)
let hasVowel = firstVowel.isSome()
if hasVowel:
  let matchBounds = firstVowel.get().captureBounds[-1]
  echo "first vowel @", matchBounds.get().a
# first vowel @1
Regex = ref object pattern*: string ## not nil pcreObj: ptr pcre.Pcre ## not nil pcreExtra: ptr pcre.ExtraData ## nil captureNameToId: Table[string, int]
re(string)
. Examples: re"foo"
, re(r"(*ANYCRLF)(?x)foo # comment").
pattern: string
captureCount: int
captureNameId: Table[string, int]
The following options may appear anywhere in the pattern, and they affect the rest of it.
(?i)
- case insensitive(?m)
- multi-line: ^
and $
match the beginning and end of lines, not of the subject string(?s)
- .
also matches newline (dotall)(?U)
- expressions are not greedy by default. ?
can be added to a quantifier to make it greedy(?x)
- whitespace and comments (#
) are ignored (extended)(?X)
- character escapes without special meaning (\w
vs. \a
) are errors (extra)One or a combination of these options may appear only at the beginning of the pattern:
(*UTF8)
- treat both the pattern and subject as UTF-8(*UCP)
- Unicode character properties; \w
matches я
(*U)
- a combination of the two options above(*FIRSTLINE)
- fails if there is not a match on the first line(*NO_AUTO_CAPTURE)
- turn off auto-capture for groups; (?<name>...)
can be used to capture(*CR)
- newlines are separated by \r
(*LF)
- newlines are separated by \n
(UNIX default)(*CRLF)
- newlines are separated by \r\n
(Windows default)(*ANYCRLF)
- newlines are separated by any of the above(*ANY)
- newlines are separated by any of the above and Unicode newlines: single characters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit library, the last two are recognized only in UTF-8 mode. — man pcre
(*JAVASCRIPT_COMPAT)
- JavaScript compatibility(*NO_STUDY)
- turn off studying; study is enabled by defaultFor more details on the leading option groups, see the Option Setting and the Newline Convention sections of the PCRE syntax manual.
RegexMatch = object pattern*: Regex ## The regex doing the matching. ## Not nil. str*: string ## The string that was matched against. ## Not nil. pcreMatchBounds: seq[Slice[cint]] ## First item is the bounds of the match ## Other items are the captures ## `a` is inclusive start, `b` is exclusive end
pattern: Regex
str: string
captures[]: string
-1
, then the whole match is returned. If the given capture was not matched, nil
is returned."abc".match(re"(\w)").captures[0] == "a"
"abc".match(re"(?<letter>\w)").captures["letter"] == "a"
"abc".match(re"(\w)\w").captures[-1] == "ab"
captureBounds[]: Option[Slice[int]]
None
is returned. The bounds are both inclusive."abc".match(re"(\w)").captureBounds[0] == 0 .. 0
"abc".match(re"").captureBounds[-1] == 0 .. -1
"abc".match(re"abc").captureBounds[-1] == 0 .. 2
match: string
matchBounds: Slice[int]
captureBounds[]
(captureBounds|captures).toTable
(captureBounds|captures).toSeq
$: string
match
Captures = distinct RegexMatch
CaptureBounds = distinct RegexMatch
RegexError = ref object of Exception
RegexInternalError = ref object of RegexError
InvalidUnicodeError = ref object of RegexError pos*: int ## the location of the invalid unicode in bytes
SyntaxError = ref object of RegexError pos*: int ## the location of the syntax error in bytes pattern*: string ## the pattern that caused the problem
StudyError = ref object of RegexError
proc captureCount(pattern: Regex): int {.raises: [FieldError, ValueError], tags: [].}
proc captureNameId(pattern: Regex): Table[string, int] {.raises: [], tags: [].}
proc captureBounds(pattern: RegexMatch): CaptureBounds {.raises: [], tags: [].}
proc captures(pattern: RegexMatch): Captures {.raises: [], tags: [].}
proc `[]`(pattern: CaptureBounds; i: int): Option[Slice[int]] {.raises: [], tags: [].}
proc `[]`(pattern: Captures; i: int): string {.raises: [UnpackError], tags: [].}
proc match(pattern: RegexMatch): string {.raises: [UnpackError], tags: [].}
proc matchBounds(pattern: RegexMatch): Slice[int] {.raises: [UnpackError], tags: [].}
proc `[]`(pattern: CaptureBounds; name: string): Option[Slice[int]] {. raises: [KeyError], tags: [].}
proc `[]`(pattern: Captures; name: string): string {.raises: [UnpackError, KeyError], tags: [].}
proc toTable(pattern: Captures; default: string = nil): Table[string, string] {. raises: [UnpackError, KeyError], tags: [].}
proc toTable(pattern: CaptureBounds; default = none(Slice[int])): Table[string, Option[Slice[int]]] {.raises: [KeyError], tags: [].}
proc toSeq(pattern: CaptureBounds; default = none(Slice[int])): seq[Option[Slice[int]]] {. raises: [FieldError, ValueError], tags: [].}
proc toSeq(pattern: Captures; default: string = nil): seq[string] {. raises: [FieldError, ValueError, UnpackError], tags: [].}
proc `$`(pattern: RegexMatch): string {.raises: [UnpackError], tags: [].}
proc `==`(a, b: Regex): bool {.raises: [], tags: [].}
proc `==`(a, b: RegexMatch): bool {.raises: [], tags: [].}
proc re(pattern: string): Regex {.raises: [KeyError, SyntaxError, StudyError, FieldError, ValueError], tags: [].}
proc match(str: string; pattern: Regex; start = 0; endpos = int.high): Option[RegexMatch] {.raises: [ FieldError, ValueError, AssertionError, AccessViolationError, RegexInternalError, InvalidUnicodeError], tags: [].}
"foo".match(re"f") == true
, but "foo".match(re"o") == false
. proc find(str: string; pattern: Regex; start = 0; endpos = int.high): Option[RegexMatch] {.raises: [ FieldError, ValueError, AssertionError, AccessViolationError, RegexInternalError, InvalidUnicodeError], tags: [].}
start
|abc
is 0
; a|bc
is 1
endpos
int.high
means the end of the string, otherwise it’s an inclusive upper bound.proc findAll(str: string; pattern: Regex; start = 0; endpos = int.high): seq[string] {.raises: [ FieldError, ValueError, UnpackError, AssertionError, AccessViolationError, RegexInternalError, InvalidUnicodeError], tags: [].}
proc contains(str: string; pattern: Regex; start = 0; endpos = int.high): bool {.raises: [ FieldError, ValueError, AssertionError, AccessViolationError, RegexInternalError, InvalidUnicodeError], tags: [].}
Same as isSome(str.find(pattern, start, endpos))
.
proc split(str: string; pattern: Regex; maxSplit = - 1; start = 0): seq[string] {.raises: [ FieldError, ValueError, UnpackError, AssertionError, AccessViolationError, RegexInternalError, InvalidUnicodeError], tags: [].}
"123".split(re"") == @["1", "2", "3"]
."12".split(re"(\d)") == @["", "1", "", "2", ""]
.maxsplit != -1
, then the string will only be split maxsplit - 1
times. This means that there will be maxsplit
strings in the output seq. "1.2.3".split(re"\.", maxsplit = 2) == @["1", "2.3"]
start
behaves the same as in find(...).
proc replace(str: string; pattern: Regex; subproc: proc (match: RegexMatch): string): string {.raises: [ FieldError, ValueError, UnpackError, AssertionError, AccessViolationError, RegexInternalError, InvalidUnicodeError], tags: [].}
Replaces each match of Regex in the string with sub
, which should never be or return nil
.
If sub
is a proc (RegexMatch): string
, then it is executed with each match and the return value is the replacement value.
If sub
is a proc (string): string
, then it is executed with the full text of the match and the return value is the replacement value.
If sub
is a string, the syntax is as follows:
$$
- literal $
$123
- capture number 123
$foo
- named capture foo
${foo}
- same as above$1$#
- first and second captures$#
- first capture$0
- full matchIf a given capture is missing, a ValueError
exception is thrown.
proc replace(str: string; pattern: Regex; subproc: proc (match: string): string): string {.raises: [ FieldError, ValueError, UnpackError, AssertionError, AccessViolationError, RegexInternalError, InvalidUnicodeError], tags: [].}
proc replace(str: string; pattern: Regex; sub: string): string {.raises: [FieldError, ValueError, UnpackError, AssertionError, AccessViolationError, RegexInternalError, InvalidUnicodeError, KeyError, Exception], tags: [].}
proc escapeRe(str: string): string {.raises: [FieldError, ValueError, UnpackError, AssertionError, AccessViolationError, RegexInternalError, InvalidUnicodeError, KeyError, Exception], tags: [].}
X
). iterator items(pattern: CaptureBounds; default = none(Slice[int])): Option[Slice[int]] {. raises: [FieldError, ValueError], tags: [].}
iterator items(pattern: Captures; default: string = nil): string {. raises: [FieldError, ValueError, UnpackError], tags: [].}
iterator findIter(str: string; pattern: Regex; start = 0; endpos = int.high): RegexMatch {.raises: [ FieldError, ValueError, UnpackError, AssertionError, AccessViolationError, RegexInternalError, InvalidUnicodeError], tags: [].}
Works the same as find(...), but finds every non-overlapping match. "2222".findIter(re"22")
is "22", "22"
, not "22", "22", "22"
.
Arguments are the same as find(...)
Variants:
proc findAll(...)
returns a seq[string]
© 2006–2017 Andreas Rumpf
Licensed under the MIT License.
https://nim-lang.org/docs/nre.html