W3cubDocs

/Nim

Module unicode

This module provides support to handle the Unicode UTF-8 encoding.

Types

Rune = distinct RuneImpl
type that can hold any Unicode character
Rune16 = distinct int16
16 bit Unicode character

Procs

proc `<=%`(a, b: Rune): bool {.raises: [], tags: [].}
proc `<%`(a, b: Rune): bool {.raises: [], tags: [].}
proc `==`(a, b: Rune): bool {.raises: [], tags: [].}
proc runeLen(s: string): int {.gcsafe, extern: "nuc$1", raises: [], tags: [].}
Returns the number of Unicode characters of the string s
proc runeLenAt(s: string; i: Natural): int {.raises: [], tags: [].}
Returns the number of bytes the rune starting at s[i] takes
proc validateUtf8(s: string): int {.raises: [], tags: [].}
Returns the position of the invalid byte in s if the string s does not hold valid UTF-8 data. Otherwise -1 is returned.
proc runeAt(s: string; i: Natural): Rune {.raises: [], tags: [].}
Returns the unicode character in s at byte index i
proc toUTF8(c: Rune): string {.gcsafe, extern: "nuc$1", raises: [], tags: [].}
Converts a rune into its UTF-8 representation
proc `$`(rune: Rune): string {.raises: [], tags: [].}
Converts a Rune to a string
proc `$`(runes: seq[Rune]): string {.raises: [], tags: [].}
Converts a sequence of Runes to a string
proc runeOffset(s: string; pos: Natural; start: Natural = 0): int {.raises: [], tags: [].}

Returns the byte position of the Unicode character at position pos in s, with an optional start byte position. Returns the special value -1 if it runs out of the string.

Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or by converting to a seq of Rune.

proc runeAtPos(s: string; pos: int): Rune {.raises: [], tags: [].}

Returns the unicode character at position pos

Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or by converting to a seq of Rune.

proc runeStrAtPos(s: string; pos: Natural): string {.raises: [], tags: [].}

Returns the Unicode character at position pos as a UTF-8 string

Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or by converting to a seq of Rune.

proc runeReverseOffset(s: string; rev: Positive): (int, int) {.raises: [], tags: [].}

Returns a tuple with the byte offset of the Unicode character at position rev in s, counting from the end (starting with 1), and the total number of runes in the string. Returns a negative value for the offset if there are too few runes in the string to satisfy the request.

Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or by converting to a seq of Rune.

proc runeSubStr(s: string; pos: int; len: int = int.high): string {.raises: [], tags: [].}

Returns the UTF-8 substring starting at codepoint pos with len codepoints. If pos or len is negative, they count from the end of the string. If len is not given, it means the longest possible string.

(Needs some examples)

proc toLower(c: Rune): Rune {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Converts c into lower case. This works for any Unicode character. If possible, prefer toLower over toUpper.
proc toUpper(c: Rune): Rune {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Converts c into upper case. This works for any Unicode character. If possible, prefer toLower over toUpper.
proc toTitle(c: Rune): Rune {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Converts c to title case
proc isLower(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a lower case Unicode character. If possible, prefer isLower over isUpper.
proc isUpper(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is an upper case Unicode character. If possible, prefer isLower over isUpper.
proc isAlpha(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is an alpha Unicode character (i.e., a letter)
proc isTitle(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a Unicode titlecase character
proc isWhiteSpace(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a Unicode whitespace character
proc isCombining(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a Unicode combining character
proc isUpper(s: string): bool {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                            raises: [], tags: [].}
Returns true iff s contains all upper case unicode characters.
proc isLower(s: string): bool {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                            raises: [], tags: [].}
Returns true iff s contains all lower case unicode characters.
proc isAlpha(s: string): bool {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                            raises: [], tags: [].}
Returns true iff s contains all alphabetic unicode characters.
proc isSpace(s: string): bool {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                            raises: [], tags: [].}
Returns true iff s contains all whitespace unicode characters.
proc toUpper(s: string): string {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                              raises: [], tags: [].}
Converts s into upper-case unicode characters.
proc toLower(s: string): string {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                              raises: [], tags: [].}
Converts s into lower-case unicode characters.
proc swapCase(s: string): string {.noSideEffect, procvar, gcsafe, extern: "nuc$1",
                               raises: [], tags: [].}

Swaps the case of unicode characters in s

Returns a new string such that the cases of all unicode characters are swapped if possible

proc capitalize(s: string): string {.noSideEffect, procvar, gcsafe, extern: "nuc$1",
                                 raises: [], tags: [].}
Converts the first character of s into an upper-case unicode character.
proc translate(s: string; replacements: proc (key: string): string): string {.gcsafe,
    extern: "nuc$1", raises: [], tags: [].}

Translates words in a string using the replacements proc to substitute words inside s with their replacements

replacements is any proc that takes a word and returns a new word to fill its place.

proc title(s: string): string {.noSideEffect, procvar, gcsafe, extern: "nuc$1",
                            raises: [], tags: [].}

Converts s to a unicode title.

Returns a new string such that the first character in each word inside s is capitalized

proc isTitle(s: string): bool {.noSideEffect, procvar, gcsafe, extern: "nuc$1Str",
                            raises: [], tags: [].}

Checks whether or not s is a unicode title.

Returns true if the first character in each word inside s is upper case and there is at least one character in s.

proc toRunes(s: string): seq[Rune] {.raises: [], tags: [].}
Obtains a sequence containing the Runes in s
proc cmpRunesIgnoreCase(a, b: string): int {.gcsafe, extern: "nuc$1", procvar,
                                        raises: [], tags: [].}
Compares two UTF-8 strings and ignores the case. Returns:

0 iff a == b
< 0 iff a < b
> 0 iff a > b

proc reversed(s: string): string {.raises: [], tags: [].}
Returns the reverse of s, interpreting it as Unicode characters. Unicode combining characters are correctly interpreted as well:
assert reversed("Reverse this!") == "!siht esreveR"
assert reversed("先秦兩漢") == "漢兩秦先"
assert reversed("as⃝df̅") == "f̅ds⃝a"
assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
proc graphemeLen(s: string; i: Natural): Natural {.raises: [], tags: [].}
The number of bytes belonging to 's[i]' including following combining characters.
proc lastRune(s: string; last: int): (Rune, int) {.raises: [], tags: [].}
Finds the last rune in 's[0..last]'. Returns the rune and its length in bytes.

Iterators

iterator runes(s: string): Rune {.raises: [], tags: [].}
Iterates over every Unicode character of the string s, returning runes
iterator utf8(s: string): string {.raises: [], tags: [].}
Iterates over every Unicode character of the string s, returning UTF-8 values

Templates

template fastRuneAt(s: string; i: int; result: untyped; doInc = true)
Returns the Unicode character s[i] in result. If doInc == true i is incremented by the number of bytes that have been processed.
template fastToUTF8Copy(c: Rune; s: var string; pos: int; doInc = true)

Copies UTF-8 representation of c into the preallocated string s starting at position pos. If doInc == true, pos is incremented by the number of bytes that have been processed.

To be the most efficient, make sure s is preallocated with an additional amount equal to the byte length of c.

© 2006–2017 Andreas Rumpf
Licensed under the MIT License.
https://nim-lang.org/docs/unicode.html