This module parses an HTML document and creates its XML tree representation. It is supposed to handle the wild HTML the real world uses.
It can be used to parse a wild HTML document and output it as a valid XHTML document (well, if you are lucky):
echo loadHtml("mydirty.html")
Every tag in the resulting tree is in lower case.
Note: The resulting XmlNode already uses the clientData field, so that field cannot be used by clients of this library.
This code demonstrates how you can iterate over all the tags in an HTML file and write back the modified version. In this case we look for hyperlinks ending with the extension .rst and convert them to .html.
import htmlparser
import xmltree  # To use '$' for XmlNode
import strtabs  # To access XmlAttributes
import os       # To use splitFile
import strutils # To use cmpIgnoreCase

proc transformHyperlinks() =
  let html = loadHTML("input.html")

  for a in html.findAll("a"):
    let href = a.attrs["href"]
    if not href.isNil:
      let (dir, filename, ext) = splitFile(href)
      if cmpIgnoreCase(ext, ".rst") == 0:
        a.attrs["href"] = dir / filename & ".html"

  writeFile("output.html", $html)
HtmlTag = enum tagUnknown, ## unknown HTML element tagA, ## the HTML ``a`` element tagAbbr, ## the deprecated HTML ``abbr`` element tagAcronym, ## the HTML ``acronym`` element tagAddress, ## the HTML ``address`` element tagApplet, ## the deprecated HTML ``applet`` element tagArea, ## the HTML ``area`` element tagArticle, ## the HTML ``article`` element tagAside, ## the HTML ``aside`` element tagAudio, ## the HTML ``audio`` element tagB, ## the HTML ``b`` element tagBase, ## the HTML ``base`` element tagBdi, ## the HTML ``bdi`` element tagBdo, ## the deprecated HTML ``bdo`` element tagBasefont, ## the deprecated HTML ``basefont`` element tagBig, ## the HTML ``big`` element tagBlockquote, ## the HTML ``blockquote`` element tagBody, ## the HTML ``body`` element tagBr, ## the HTML ``br`` element tagButton, ## the HTML ``button`` element tagCanvas, ## the HTML ``canvas`` element tagCaption, ## the HTML ``caption`` element tagCenter, ## the deprecated HTML ``center`` element tagCite, ## the HTML ``cite`` element tagCode, ## the HTML ``code`` element tagCol, ## the HTML ``col`` element tagColgroup, ## the HTML ``colgroup`` element tagCommand, ## the HTML ``command`` element tagDatalist, ## the HTML ``datalist`` element tagDd, ## the HTML ``dd`` element tagDel, ## the HTML ``del`` element tagDetails, ## the HTML ``details`` element tagDfn, ## the HTML ``dfn`` element tagDialog, ## the HTML ``dialog`` element tagDiv, ## the HTML ``div`` element tagDir, ## the deprecated HTML ``dir`` element tagDl, ## the HTML ``dl`` element tagDt, ## the HTML ``dt`` element tagEm, ## the HTML ``em`` element tagEmbed, ## the HTML ``embed`` element tagFieldset, ## the HTML ``fieldset`` element tagFigcaption, ## the HTML ``figcaption`` element tagFigure, ## the HTML ``figure`` element tagFont, ## the deprecated HTML ``font`` element tagFooter, ## the HTML ``footer`` element tagForm, ## the HTML ``form`` element tagFrame, ## the HTML ``frame`` element tagFrameset, ## the deprecated HTML 
``frameset`` element tagH1, ## the HTML ``h1`` element tagH2, ## the HTML ``h2`` element tagH3, ## the HTML ``h3`` element tagH4, ## the HTML ``h4`` element tagH5, ## the HTML ``h5`` element tagH6, ## the HTML ``h6`` element tagHead, ## the HTML ``head`` element tagHeader, ## the HTML ``header`` element tagHgroup, ## the HTML ``hgroup`` element tagHtml, ## the HTML ``html`` element tagHr, ## the HTML ``hr`` element tagI, ## the HTML ``i`` element tagIframe, ## the deprecated HTML ``iframe`` element tagImg, ## the HTML ``img`` element tagInput, ## the HTML ``input`` element tagIns, ## the HTML ``ins`` element tagIsindex, ## the deprecated HTML ``isindex`` element tagKbd, ## the HTML ``kbd`` element tagKeygen, ## the HTML ``keygen`` element tagLabel, ## the HTML ``label`` element tagLegend, ## the HTML ``legend`` element tagLi, ## the HTML ``li`` element tagLink, ## the HTML ``link`` element tagMap, ## the HTML ``map`` element tagMark, ## the HTML ``mark`` element tagMenu, ## the deprecated HTML ``menu`` element tagMeta, ## the HTML ``meta`` element tagMeter, ## the HTML ``meter`` element tagNav, ## the HTML ``nav`` element tagNobr, ## the deprecated HTML ``nobr`` element tagNoframes, ## the deprecated HTML ``noframes`` element tagNoscript, ## the HTML ``noscript`` element tagObject, ## the HTML ``object`` element tagOl, ## the HTML ``ol`` element tagOptgroup, ## the HTML ``optgroup`` element tagOption, ## the HTML ``option`` element tagOutput, ## the HTML ``output`` element tagP, ## the HTML ``p`` element tagParam, ## the HTML ``param`` element tagPre, ## the HTML ``pre`` element tagProgress, ## the HTML ``progress`` element tagQ, ## the HTML ``q`` element tagRp, ## the HTML ``rp`` element tagRt, ## the HTML ``rt`` element tagRuby, ## the HTML ``ruby`` element tagS, ## the deprecated HTML ``s`` element tagSamp, ## the HTML ``samp`` element tagScript, ## the HTML ``script`` element tagSection, ## the HTML ``section`` element tagSelect, ## the HTML ``select`` element 
tagSmall, ## the HTML ``small`` element tagSource, ## the HTML ``source`` element tagSpan, ## the HTML ``span`` element tagStrike, ## the deprecated HTML ``strike`` element tagStrong, ## the HTML ``strong`` element tagStyle, ## the HTML ``style`` element tagSub, ## the HTML ``sub`` element tagSummary, ## the HTML ``summary`` element tagSup, ## the HTML ``sup`` element tagTable, ## the HTML ``table`` element tagTbody, ## the HTML ``tbody`` element tagTd, ## the HTML ``td`` element tagTextarea, ## the HTML ``textarea`` element tagTfoot, ## the HTML ``tfoot`` element tagTh, ## the HTML ``th`` element tagThead, ## the HTML ``thead`` element tagTime, ## the HTML ``time`` element tagTitle, ## the HTML ``title`` element tagTr, ## the HTML ``tr`` element tagTrack, ## the HTML ``track`` element tagTt, ## the HTML ``tt`` element tagU, ## the deprecated HTML ``u`` element tagUl, ## the HTML ``ul`` element tagVar, ## the HTML ``var`` element tagVideo, ## the HTML ``video`` element tagWbr ## the HTML ``wbr`` element
tagToStr = ["a", "abbr", "acronym", "address", "applet", "area", "article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body", "br", "button", "canvas", "caption", "center", "cite", "code", "col", "colgroup", "command", "datalist", "dd", "del", "details", "dfn", "dialog", "div", "dir", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure", "font", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "html", "hr", "i", "iframe", "img", "input", "ins", "isindex", "kbd", "keygen", "label", "legend", "li", "link", "map", "mark", "menu", "meta", "meter", "nav", "nobr", "noframes", "noscript", "object", "ol", "optgroup", "option", "output", "p", "param", "pre", "progress", "q", "rp", "rt", "ruby", "s", "samp", "script", "section", "select", "small", "source", "span", "strike", "strong", "style", "sub", "summary", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"]
InlineTags = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont, tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn, tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd, tagLabel, tagMap, tagObject, tagQ, tagSamp, tagScript, tagSelect, tagSmall, tagSpan, tagStrong, tagSub, tagSup, tagTextarea, tagTt, tagVar, tagApplet, tagBasefont, tagFont, tagIframe, tagU, tagS, tagStrike, tagWbr}
BlockTags = {tagAddress, tagBlockquote, tagCenter, tagDel, tagDir, tagDiv, tagDl, tagFieldset, tagForm, tagH1, tagH2, tagH3, tagH4, tagH5, tagH6, tagHr, tagIns, tagIsindex, tagMenu, tagNoframes, tagNoscript, tagOl, tagP, tagPre, tagTable, tagUl, tagCenter, tagDir, tagIsindex, tagMenu, tagNoframes}
SingleTags = {tagArea, tagBase, tagBasefont, tagBr, tagCol, tagFrame, tagHr, tagImg, tagIsindex, tagLink, tagMeta, tagParam, tagWbr}
proc htmlTag(n: XmlNode): HtmlTag {.raises: [], tags: [].}
Gets n's tag as an HtmlTag.
proc htmlTag(s: string): HtmlTag {.raises: [], tags: [].}
Converts s to an HtmlTag. If s is no HTML tag, tagUnknown is returned.
proc entityToUtf8(entity: string): string {.raises: [], tags: [].}
Converts an HTML entity name like &Uuml; to its UTF-8 equivalent. "" is returned if the entity name is unknown. The HTML parser already converts entities to UTF-8.
proc parseHtml(s: Stream; filename: string; errors: var seq[string]): XmlNode {. raises: [Exception, ValueError], tags: [ReadIOEffect, RootEffect].}
Parses the HTML from stream s and returns an XmlNode. Every parsing error that occurs is added to the errors sequence.
proc parseHtml(s: Stream): XmlNode {.raises: [Exception, ValueError], tags: [ReadIOEffect, RootEffect].}
Parses the HTML from stream s and returns an XmlNode. All parsing errors are ignored.
proc loadHtml(path: string; errors: var seq[string]): XmlNode {. raises: [IOError, Exception, ValueError], tags: [ReadIOEffect, RootEffect].}
Loads and parses HTML from the file specified by path, and returns an XmlNode. Every parsing error that occurs is added to the errors sequence.
proc loadHtml(path: string): XmlNode {.raises: [IOError, Exception, ValueError], tags: [ReadIOEffect, RootEffect].}
Loads and parses HTML from the file specified by path, and returns an XmlNode. All parsing errors are ignored.
© 2006–2017 Andreas Rumpf
Licensed under the MIT License.
https://nim-lang.org/docs/htmlparser.html