htmltools

Various command line tools to transform HTML documents
git clone git://git.entf.net/htmltools
Log | Files | Refs | README | LICENSE

htmltools.go (3081B)


      1 package htmltools
      2 
      3 import (
      4 	"errors"
      5 	"fmt"
      6 	"strings"
      7 
      8 	"golang.org/x/net/html"
      9 )
     10 
// Sentinel errors returned by the helpers in this package.
var (
	// ErrNodeIsNotADocumentNode is returned by Body when the node it is
	// given is not of type html.DocumentNode.
	ErrNodeIsNotADocumentNode = errors.New("Not a document node")
	// ErrNodeHasNoParent is returned by Unwrap when the node it is given
	// has a nil Parent.
	ErrNodeHasNoParent        = errors.New("Node has no parent")
)
     15 
// NodeMatchFunc is a predicate over HTML nodes: it reports whether the given
// node matches. FindRecursive accepts one to select which nodes it yields.
type NodeMatchFunc func(*html.Node) bool
     17 
     18 // Gets the body from an HTML document node.
     19 func Body(doc *html.Node) (*html.Node, error) {
     20 	if doc.Type != html.DocumentNode {
     21 		return nil, ErrNodeIsNotADocumentNode
     22 	}
     23 	var htmln *html.Node
     24 	for n := doc.FirstChild; n != nil; n = n.NextSibling {
     25 		if n.Type == html.ElementNode && strings.ToLower(n.Data) == "html" {
     26 			htmln = n
     27 			break
     28 		}
     29 	}
     30 	if htmln == nil {
     31 		return nil, nil
     32 	}
     33 	var body *html.Node
     34 	for n := htmln.FirstChild; n != nil; n = n.NextSibling {
     35 		if strings.ToLower(n.Data) == "body" {
     36 			body = n
     37 			break
     38 		}
     39 	}
     40 	return body, nil
     41 }
     42 
     43 // Gets all direct children.
     44 func Children(node *html.Node) []*html.Node {
     45 	nodes := make([]*html.Node, 0)
     46 	for n := node.FirstChild; n != nil; n = n.NextSibling {
     47 		nodes = append(nodes, n)
     48 	}
     49 	return nodes
     50 }
     51 
     52 func findRecursive(node *html.Node, nodeFunc func(*html.Node) bool, ch chan<- *html.Node) {
     53 	if nodeFunc == nil || nodeFunc(node) {
     54 		ch <- node
     55 	}
     56 	for _, c := range Children(node) {
     57 		findRecursive(c, nodeFunc, ch)
     58 	}
     59 }
     60 
     61 // Returns a channel providing all nodes that match nodeFunc recursively through
     62 // the whole document. If nodeFunc is `nil`, all nodes match.
     63 func FindRecursive(doc *html.Node, nodeFunc NodeMatchFunc) <-chan *html.Node {
     64 	ch := make(chan *html.Node)
     65 	go func() {
     66 		findRecursive(doc, nodeFunc, ch)
     67 		close(ch)
     68 	}()
     69 	return ch
     70 }
     71 
     72 // Returns all attribite values specified in attrs for nodes.
     73 func Attr(attrs []string, nodes ...*html.Node) ([][]string, error) {
     74 	for i, attr := range attrs {
     75 		attrs[i] = strings.ToLower(attr)
     76 	}
     77 	results := make([][]string, 0)
     78 	for _, n := range nodes {
     79 		if n.Type != html.ElementNode {
     80 			continue
     81 		}
     82 		list := make([]string, len(attrs))
     83 		var any bool
     84 		for i, attrn := range attrs {
     85 			for _, attr := range n.Attr {
     86 				if strings.ToLower(attr.Key) == attrn {
     87 					any = true
     88 					list[i] = attr.Val
     89 				}
     90 			}
     91 		}
     92 		if any {
     93 			results = append(results, list)
     94 		}
     95 	}
     96 	return results, nil
     97 }
     98 
     99 // Indents all headings by a certain level.
    100 func IndentHeadings(level int, nodes ...*html.Node) error {
    101 	for _, n := range nodes {
    102 		switch strings.ToLower(n.Data) {
    103 		case "h1", "h2", "h3", "h4", "h5", "h6":
    104 		default:
    105 			continue
    106 		}
    107 		l := int(n.Data[1]) - 48 //HACK: ASCII to number
    108 		l += level
    109 		if l > 6 {
    110 			l = 6
    111 		} else if l < 1 {
    112 			l = 1
    113 		}
    114 		n.Data = fmt.Sprintf("h%d", l)
    115 	}
    116 	return nil
    117 }
    118 
    119 // Removes node from parent and replaces it by it's children.
    120 func Unwrap(node *html.Node) error {
    121 	if node.Parent == nil {
    122 		return ErrNodeHasNoParent
    123 	}
    124 	for _, c := range Children(node) {
    125 		node.RemoveChild(c)
    126 		node.Parent.InsertBefore(c, node)
    127 	}
    128 	node.Parent.RemoveChild(node)
    129 	return nil
    130 }
    131 
    132 // Creates a NodeMatchFunc, matching a certain NodeType
    133 func MatchNodeTypeFunc(nodeType html.NodeType) NodeMatchFunc {
    134 	return func(node *html.Node) bool {
    135 		return node.Type == nodeType
    136 	}
    137 }