This module provides support to handle the Unicode UTF-8 encoding.
Rune = distinct RuneImpl
Rune16 = distinct int16
proc `<=%`(a, b: Rune): bool {.raises: [], tags: [].}
proc `<%`(a, b: Rune): bool {.raises: [], tags: [].}
proc `==`(a, b: Rune): bool {.raises: [], tags: [].}
proc runeLen(s: string): int {.gcsafe, extern: "nuc$1", raises: [], tags: [].}
s
proc runeLenAt(s: string; i: Natural): int {.raises: [], tags: [].}
s[i]
takes proc validateUtf8(s: string): int {.raises: [], tags: [].}
s
if the string s
does not hold valid UTF-8 data. Otherwise -1
is returned. proc runeAt(s: string; i: Natural): Rune {.raises: [], tags: [].}
s
at byte index i
proc toUTF8(c: Rune): string {.gcsafe, extern: "nuc$1", raises: [], tags: [].}
proc `$`(rune: Rune): string {.raises: [], tags: [].}
proc `$`(runes: seq[Rune]): string {.raises: [], tags: [].}
proc runeOffset(s: string; pos: Natural; start: Natural = 0): int {.raises: [], tags: [].}
Returns the byte position of the Unicode character at position pos in s with an optional start byte position. Returns the special value -1 if it runs out of the string.
Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or conversion to a seq of Rune.
proc runeAtPos(s: string; pos: int): Rune {.raises: [], tags: [].}
Returns the unicode character at position pos
Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or conversion to a seq of Rune.
proc runeStrAtPos(s: string; pos: Natural): string {.raises: [], tags: [].}
Returns the Unicode character at position pos as a UTF-8 string
Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or conversion to a seq of Rune.
proc runeReverseOffset(s: string; rev: Positive): (int, int) {.raises: [], tags: [].}
Returns a tuple with the byte offset of the Unicode character at position rev
in s counting from the end (starting with 1) and the total number of runes in the string. Returns a negative value for offset if there are too few runes in the string to satisfy the request.
Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or conversion to a seq of Rune.
proc runeSubStr(s: string; pos: int; len: int = int.high): string {.raises: [], tags: [].}
Returns the UTF-8 substring starting at codepoint pos with len codepoints. If pos or len is negative they count from the end of the string. If len is not given it means the longest possible string.
(Needs some examples)
proc toLower(c: Rune): Rune {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
c
into lower case. This works for any Unicode character. If possible, prefer toLower
over toUpper
. proc toUpper(c: Rune): Rune {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
c
into upper case. This works for any Unicode character. If possible, prefer toLower
over toUpper
. proc toTitle(c: Rune): Rune {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
c
to title case proc isLower(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
c
is a lower case Unicode character. If possible, prefer isLower
over isUpper
. proc isUpper(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
c
is an upper case Unicode character. If possible, prefer isLower
over isUpper
. proc isAlpha(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
c
is an alpha Unicode character (i.e., a letter) proc isTitle(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
c
is a Unicode titlecase character proc isWhiteSpace(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
c
is a Unicode whitespace character proc isCombining(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
c
is a Unicode combining character proc isUpper(s: string): bool {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
proc isLower(s: string): bool {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
proc isAlpha(s: string): bool {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
proc isSpace(s: string): bool {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
proc toUpper(s: string): string {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
proc toLower(s: string): string {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
proc swapCase(s: string): string {.noSideEffect, procvar, gcsafe, extern: "nuc$1", raises: [], tags: [].}
Swaps the case of unicode characters in s
Returns a new string such that the cases of all unicode characters are swapped if possible
proc capitalize(s: string): string {.noSideEffect, procvar, gcsafe, extern: "nuc$1", raises: [], tags: [].}
proc translate(s: string; replacements: proc (key: string): string): string {.gcsafe, extern: "nuc$1", raises: [], tags: [].}
Translates words in a string using the replacements proc to substitute words inside s with their replacements
replacements is any proc that takes a word and returns a new word to fill its place.
proc title(s: string): string {.noSideEffect, procvar, gcsafe, extern: "nuc$1", raises: [], tags: [].}
Converts s to a unicode title.
Returns a new string such that the first character in each word inside s is capitalized
proc isTitle(s: string): bool {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str", raises: [], tags: [].}
Checks whether or not s is a unicode title.
Returns true if the first character in each word inside s is upper case and there is at least one character in s.
proc toRunes(s: string): seq[Rune] {.raises: [], tags: [].}
s
proc cmpRunesIgnoreCase(a, b: string): int {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
0 iff a == b
< 0 iff a < b
> 0 iff a > b
proc reversed(s: string): string {.raises: [], tags: [].}
s
, interpreting it as Unicode characters. Unicode combining characters are correctly interpreted as well: assert reversed("Reverse this!") == "!siht esreveR" assert reversed("先秦兩漢") == "漢兩秦先" assert reversed("as⃝df̅") == "f̅ds⃝a" assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
proc graphemeLen(s: string; i: Natural): Natural {.raises: [], tags: [].}
proc lastRune(s: string; last: int): (Rune, int) {.raises: [], tags: [].}
iterator runes(s: string): Rune {.raises: [], tags: [].}
s
returning runes iterator utf8(s: string): string {.raises: [], tags: [].}
s
returning utf8 values template fastRuneAt(s: string; i: int; result: untyped; doInc = true)
s[i]
in result
. If doInc == true
i
is incremented by the number of bytes that have been processed. template fastToUTF8Copy(c: Rune; s: var string; pos: int; doInc = true)
Copies UTF-8 representation of c into the preallocated string s starting at position pos. If doInc == true, pos is incremented by the number of bytes that have been processed.
To be the most efficient, make sure s is preallocated with an additional amount equal to the byte length of c.
© 2006–2017 Andreas Rumpf
Licensed under the MIT License.
https://nim-lang.org/docs/unicode.html