#-------------------------------------------------------------------------
#
# SUMMARY:
#
# set doc [hdom parse HTML]
#
# DOCUMENT OBJECT API:
#
# $doc root
# Return the root node of the document.
#
# $doc destroy
# Destroy DOM object
#
# $doc parsenode HTML
# Parse return a new node or nodes.
#
# NODE OBJECT API:
#
# $node tag
# Get or set the nodes tag type. Always lower-case. Empty string
# for text.
#
# $node children
# Return a list of the nodes children.
#
# $node text
# For a text node, return the text. For any other node, return the
# concatenation of the text belonging to all descendent text nodes
# (in document order).
#
# $node parent
# Return the nodes parent node.
#
# $node offset
# Return the byte offset of the node within the document (if any).
#
# $node foreach_descendent VARNAME SCRIPT
# Iterate through all nodes in the sub-tree headed by $node. $node
# itself is not visited.
#
# $node attr ?-default VALUE? ATTR ?NEWVALUE?
#
# $node search PATTERN
# Return a list of descendent nodes that match pattern PATTERN.
#
# $node addChild ?-before BEFORENODE? NEWNODE
# Detach NEWNODE from its current parent and add it as the first
# child of $node. Or, if the -before switch is specified, immediately
# before BEFORENODE.
#
# $node html
# Return HTML code equivalent to the node.
#
# $node detach
# Detach $node from its parent.
#
catch { load ./parsehtml.so }
#-------------------------------------------------------------------------
# Throw an exception if the expression passed as the only argument does
# not evaluate to true.
#
proc assert {condition} {
uplevel [list if "! ($condition)" [list error "assert failed: $condition"]]
}
#--------------------------------------------------------------------------
#
# A parsed HTML document tree is store in a single array object. Each node
# is stored in three array entries:
#
# O($id,tag) ("" for text, tag name for other nodes)
# O($id,detail) (text data for text, key-value attribute list for others)
# O($id,children) (list of child ids)
# O($id,parent) (parent node id)
# O($id,offset) (byte offset within original document text)
#
# Each node is identified by its key in the array. The root node's key is
# stored in O(root). All nodes have automatically generated keys.
#
namespace eval hdom {
variable iNextid 0
# Ignore all tags in the aIgnore[] array.
variable aIgnore
set aIgnore(html) 1
set aIgnore(/html) 1
set aIgnore(!doctype) 1
# All inline tags.
variable aInline
foreach x {
tt i b big small u
em strong dfn code samp kbd var cite abbr acronym
a img object br script map q sub sup span bdo
input select textarea label button tcl yyterm yynonterm
} { set aInline($x) 1 }
# All self-closing tags (set below)
variable aSelfClosing
variable aContentChecker
set aContentChecker(p) HtmlInlineContent
set aContentChecker(th) HtmlTableCellContent
set aContentChecker(td) HtmlTableCellContent
set aContentChecker(tr) HtmlTableRowContent
set aContentChecker(table) HtmlTableContent
set aContentChecker(a) HtmlAnchorContent
set aContentChecker(ul) HtmlUlContent
set aContentChecker(ol) HtmlUlContent
set aContentChecker(menu) HtmlUlContent
set aContentChecker(dir) HtmlUlContent
set aContentChecker(form) HtmlFormContent
set aContentChecker(option) HtmlPcdataContent
set aContentChecker(li) HtmlLiContent
set aContentChecker(dt) HtmlLiContent
set aContentChecker(dd) HtmlLiContent
set aContentChecker(dl) HtmlDlContent
# Add content checkers for all self-closing tags.
foreach x {
area base br hr iframe img input isindex link meta
param script style embed nextid wbr bgsound
} {
set aContentChecker($x) HtmlEmptyContent
set aSelfClosing($x) 1
}
namespace export parse
namespace ensemble create
}
proc ::hdom::nextNodeId {} {
variable iNextid
set res "::hdom::node_$iNextid"
incr iNextid
return $res
}
proc ::hdom::nextDocId {} {
variable iNextid
set res "::hdom::doc_$iNextid"
incr iNextid
return $res
}
# Return "close" if the content is not Ok. Or "parent" if it is ok, but the
# caller should check the parent. Or "ok" if is unconditionally Ok.
#
proc ::hdom::HtmlInlineContent {tag} {
variable aInline
if {$tag == ""} { return "ok" }
if {[info exists aInline($tag)]} { return "parent" }
return "close"
}
proc ::hdom::HtmlEmptyContent {tag} {
return "close"
}
proc ::hdom::HtmlTableCellContent {tag} {
if {$tag == "th" || $tag == "td" || $tag == "tr"} { return "close" }
return "parent"
}
proc ::hdom::HtmlTableRowContent {tag} {
if {$tag == "tr"} { return "close" }
return "parent"
}
proc ::hdom::HtmlTableContent {tag} {
if {$tag == "table"} { return "close" }
return "ok"
}
proc ::hdom::HtmlLiContent {tag} {
if {$tag == ""} { return "ok" }
if {$tag == "li" || $tag=="dd" || $tag=="dt"} { return "close" }
return "parent"
}
proc ::hdom::HtmlAnchorContent {tag} {
if {$tag == ""} { return "ok" }
if {$tag == "a"} { return "close" }
return "parent"
}
proc ::hdom::HtmlUlContent {tag} {
if {$tag == "" || $tag=="li"} { return "ok" }
return "parent"
}
proc ::hdom::HtmlDlContent {tag} {
if {$tag == "dd" || $tag=="dt" || $tag==""} { return "ok" }
return "parent"
}
proc ::hdom::HtmlFormContent {tag} {
if {$tag == "tr" || $tag=="td" || $tag=="th"} { return "close" }
return "parent"
}
proc ::hdom::HtmlPcdataContent {tag} {
if {$tag == ""} { return "parent" }
return "close"
}
proc ::hdom::parsehtml_cb {arrayname tag detail offset endoffset} {
variable aIgnore
variable aContentChecker
upvar $arrayname O
# Fold the tag name to lower-case.
set tag [string tolower $tag]
# Ignore <html> and </html> tags.
if {[info exists aIgnore($tag)]} return
# Ingore <svg>. <svg> comes through parsehtml as if it were a single
# big block of text.
if {$tag=="" && [string match {<svg*} $detail]} return
# An explicit close tag. Search for a tag to close.
if { [string range $tag 0 0]=="/" } {
set match [string range $tag 1 end]
for {set id $O(current)} {$id!=""} {set id $O($id,parent)} {
if {$O($id,tag)==$match} break
}
# The closing tag matches node $id. So the new current node is its parent.
if {$id!=""} {
assert {$id!=$O(root)}
set O(current) $O($id,parent)
}
return
}
# Check for implicit close tags.
if {$tag!=""} {
for {set id $O(current)} {$id!=""} {set id $O($id,parent)} {
set ptag $O($id,tag)
if { [info exists aContentChecker($ptag)] } {
switch -- [$aContentChecker($ptag) $tag] {
"parent" {
# no-op
}
"close" {
# Close tag $id
assert {$id!=$O(root)}
set O(current) $O($id,parent)
}
"ok" {
# Break out of the for(...) loop
break
}
default {
error "content checker $aContentChecker($ptag) failed"
}
}
}
}
}
# Add the new node to the database.
set newid [nextNodeId]
set O($newid,tag) $tag
set O($newid,detail) $detail
set O($newid,children) [list]
set O($newid,parent) $O(current)
set O($newid,offset) $offset
# Link it into its parent's child array.
lappend O($O(current),children) $newid
if {$tag != ""} {
set O(current) $newid
}
}
# Node method [$node tag]
#
proc ::hdom::nm_tag {arrayname id} {
upvar $arrayname O
return $O($id,tag)
}
# Node method [$node parent]
#
proc ::hdom::nm_parent {arrayname id} {
upvar $arrayname O
return [create_node_command $arrayname $O($id,parent)]
}
# Node method [$node children]
#
proc ::hdom::nm_children {arrayname id} {
upvar $arrayname O
foreach c $O($id,children) { create_node_command $arrayname $c }
return $O($id,children)
}
proc ::hdom::foreach_desc {arrayname id varname script level} {
upvar $arrayname O
foreach c $O($id,children) {
create_node_command $arrayname $c
uplevel $level [list set $varname $c]
set rc [catch { uplevel $level $script } msg info]
if {$rc == 0 || $rc == 4} {
# TCL_OK or TCL_CONTINUE Do nothing
} elseif {$rc == 3} {
# TCL_BREAK
return 1
} else {
# TCL_RETURN or TCL_ERROR.
return -options $info
}
if {[foreach_desc $arrayname $c $varname $script [expr $level+1]]} {
return 1
}
}
return 0
}
# Node method [$node foreach_descendent]
#
proc ::hdom::nm_foreach_descendent {arrayname id varname script} {
foreach_desc $arrayname $id $varname $script 2
return ""
}
# Node method [$node text]
#
proc ::hdom::nm_text {arrayname id} {
upvar $arrayname O
if { $O($id,tag)=="" } {
return $O($id,detail)
}
set ret ""
$id foreach_descendent N {
if {[$N tag] == ""} { append ret [$N text] }
}
return $ret
}
# Node method [$node offset]
#
proc ::hdom::nm_offset {arrayname id} {
upvar $arrayname O
return $O($id,offset)
}
# Node method: $node attr ?-default VALUE? ?ATTR?
#
# $node attr
# $node attr ATTR
# $node attr ATTR NEWVALUE
# $node attr -default VALUE ATTR
#
proc ::hdom::nm_attr {arrayname id args} {
upvar $arrayname O
set dict $O($id,detail)
if {[llength $args]==0} { return $dict }
if {[llength $args]==1} {
set nm [lindex $args 0]
if {[catch { set res [dict get $dict $nm] }]} {
error "no such attribute: $nm"
}
}
if {[llength $args]==3} {
if {[lindex $args 0] != "-default"} {
error "expected \"-default\" got \"[lindex $args 0]\""
}
set nm [lindex $args 2]
if {[catch { set res [dict get $dict $nm] }]} {
set res [lindex $args 1]
}
}
if {[llength $args]==2} {
set res [lindex $args 1]
dict set O($id,detail) [lindex $args 0] $res
}
return $res
}
proc ::hdom::nodematches {N pattern} {
set tag [$N tag]
if {[string compare $pattern $tag]==0} {
return 1
}
return 0
}
# Node method: $node search PATTERN
#
proc ::hdom::nm_search {arrayname id pattern} {
set ret [list]
$id foreach_descendent N {
if {[::hdom::nodematches $N $pattern]} {
lappend ret $N
}
}
set ret
}
# Node method: $node html
#
proc ::hdom::nm_html {arrayname id} {
variable aSelfClosing
set res ""
set tag [$id tag]
if {$tag ==""} {
append res [$id text]
} else {
append res "<$tag"
foreach {k v} [$id attr] { append res " $k=\"$v\"" }
append res ">"
foreach N [$id children] {
append res [$N html]
}
if { [info exists aSelfClosing($tag)]==0 } {
append res "</$tag>"
}
}
set res
}
# Node method: $node detach
#
proc ::hdom::nm_detach {arrayname id} {
upvar $arrayname O
set P $O($id,parent)
if {$P!=""} {
set idx [lsearch $O($P,children) $id]
if {$idx<0} {error "internal error!"}
set O($P,children) [lreplace $O($P,children) $idx $idx]
}
}
# Node method:
#
# $node addChild CHILD
# $node addChild -before BEFORE CHILD
#
proc ::hdom::nm_addChild {arrayname id args} {
upvar $arrayname O
if {[llength $args]==1} {
set newidx 0
set newchild [lindex $args 0]
} elseif {[llength $args]==3} {
if {[lindex $args 0] != "-before"} {
error "expected \"-before\" got \"[lindex $args 0]\""
}
set before [lindex $args 1]
set newidx [lsearch $O($id,children) $before]
if {$newidx < 0 } {error "$before is not a child of $id"}
set newchild [lindex $args 2]
}
# Unlink $newchild from its parent:
$newchild detach
# Link $newchild to new parent ($id):
set O($id,children) [linsert $O($id,children) $newidx $newchild]
set O($newchild,parent) $id
}
# Document method [$doc root]
#
proc ::hdom::dm_root {arrayname} {
upvar $arrayname O
return [create_node_command $arrayname $O(root)]
}
# Document method [$doc destroy]
#
proc ::hdom::dm_destroy {arrayname} {
upvar $arrayname O
proc $arrayname {method args} {}
foreach cmd $O(cmdlist) { proc $cmd {methods args} {} }
catch { uplevel [list array unset $arrayname ] }
}
# Document method [$doc parsenode]
#
proc ::hdom::dm_parsenode {arrayname html} {
upvar $arrayname O
set current ""
set O($current,tag) html
set O($current,children) [list]
set O($current,parent) ""
set O($current,detail) ""
set O($current,offset) 0
set O(current) $current
parsehtml $html [list parsehtml_cb $arrayname]
set res $O($current,children)
unset O($current,tag)
unset O($current,children)
unset O($current,parent)
unset O($current,detail)
unset O($current,offset)
foreach id $res { create_node_command $arrayname $id }
return $res
}
proc ::hdom::node_method {arrayname id method args} {
uplevel ::hdom::nm_$method $arrayname $id $args
}
proc ::hdom::document_method {arrayname method args} {
uplevel ::hdom::dm_$method $arrayname $args
}
# Return the name of the command for node $id, part of document $arrayname.
#
proc ::hdom::create_node_command {arrayname id} {
upvar $arrayname O
if { [llength [info commands $id]]==0} {
proc $id {method args} [subst -nocommands {
uplevel ::hdom::node_method $arrayname $id [set method] [set args]
}]
lappend O(cmdlist) $id
}
return $id
}
# Parse the html document passed as the first argument.
#
proc ::hdom::parse {html} {
set doc [nextDocId]
variable $doc
upvar 0 $doc O
set root [nextNodeId]
# Add the root node to the tree.
#
set O($root,tag) html
set O($root,detail) [list]
set O($root,children) [list]
set O($root,parent) ""
# Setup the other state data for the parse.
#
set O(current) $root
set O(root) $root
set O(cmdlist) [list]
parsehtml $html [list parsehtml_cb O]
# Create the document object command.
#
proc $doc {method args} [subst -nocommands {
uplevel ::hdom::document_method $doc [set method] [set args]
}]
return $doc
}