htmltools

Various command line tools to transform HTML documents
git clone git://git.entf.net/htmltools
Log | Files | Refs | README | LICENSE

commit bce8ffedc776db66a88a6179a785834793bcf984
Author: Lukas Henkel <lh@entf.net>
Date:   Fri, 22 Mar 2019 21:00:29 +0100

Initial commit

Diffstat:
A go.mod | 6 ++++++
A go.sum | 8 ++++++++
A htmlremove/main.go | 34 ++++++++++++++++++++++++++++++++++
A htmltotext/htmltotext.1.scd | 21 +++++++++++++++++++++
A htmltotext/main.go | 26 ++++++++++++++++++++++++++
A htmlunwrap/main.go | 42 ++++++++++++++++++++++++++++++++++++++++++
A shared/shared.go | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 185 insertions(+), 0 deletions(-)

diff --git a/go.mod b/go.mod @@ -0,0 +1,6 @@ +module entf.net/htmltools + +require ( + github.com/andybalholm/cascadia v1.0.0 + golang.org/x/net v0.0.0-20190320064053-1272bf9dcd53 +) diff --git a/go.sum b/go.sum @@ -0,0 +1,8 @@ +github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= +github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190320064053-1272bf9dcd53 h1:kcXqo9vE6fsZY5X5Rd7R1l7fTgnWaDCVmln65REefiE= +golang.org/x/net v0.0.0-20190320064053-1272bf9dcd53/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/htmlremove/main.go b/htmlremove/main.go @@ -0,0 +1,34 @@ +package main // import "entf.net/htmltools/htmlremove" + +import ( + "fmt" + "os" + + "github.com/andybalholm/cascadia" + "golang.org/x/net/html" + + "entf.net/htmltools/shared" +) + +func main() { + args := os.Args[1:] + if len(args) == 0 { + fmt.Println("usage: htmlremove SELECTOR [FILES...]") + os.Exit(1) + } + sel, err := cascadia.Compile(args[0]) + if err != nil { + fmt.Fprintf(os.Stderr, "selector invalid: %v\n", err) + os.Exit(1) + } + shared.Main(args[1:], func(doc *html.Node) { + parse(sel, doc) + }) +} + +func parse(sel cascadia.Selector, doc *html.Node) { + for _, n := range sel.MatchAll(doc) { + n.Parent.RemoveChild(n) + } + html.Render(os.Stdout, doc) +} diff --git a/htmltotext/htmltotext.1.scd b/htmltotext/htmltotext.1.scd @@ -0,0 +1,21 @@ +HTMLTOTEXT(1) + +# NAME + +htmltotext - extract all text from an HTML document + +# SYNOPSIS + +*htmltotext* [_FILE_]... 
+ +# DESCRIPTION + +Reads each file in sequence and prints all text without the HTML tags to +standard output. If no FILE is given or FILE is -, read standard input. + +If any FILE cannot be processed, a message prefixed with the FILE name will be +written to standard error. + +# AUTHOR + +Lukas Henkel <lh@entf.net> diff --git a/htmltotext/main.go b/htmltotext/main.go @@ -0,0 +1,26 @@ +package main // import "entf.net/htmltools/htmltotext" + +import ( + "fmt" + "os" + "strings" + + "golang.org/x/net/html" + + "entf.net/htmltools/shared" +) + +func main() { + shared.Main(os.Args[1:], visit) +} + +func visit(n *html.Node) { + if n.Type == html.TextNode { + if t := strings.TrimSpace(n.Data); t != "" { + fmt.Println(t) + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + visit(c) + } +} diff --git a/htmlunwrap/main.go b/htmlunwrap/main.go @@ -0,0 +1,42 @@ +package main // import "entf.net/htmltools/htmlunwrap" + +import ( + "fmt" + "os" + + "github.com/andybalholm/cascadia" + "golang.org/x/net/html" + + "entf.net/htmltools/shared" +) + +func main() { + args := os.Args[1:] + if len(args) == 0 { + fmt.Println("usage: htmlremove SELECTOR [FILES...]") + os.Exit(1) + } + sel, err := cascadia.Compile(args[0]) + if err != nil { + fmt.Fprintf(os.Stderr, "selector invalid: %v\n", err) + os.Exit(1) + } + shared.Main(args[1:], func(doc *html.Node) { + unwrap(sel, doc) + }) +} + +func unwrap(sel cascadia.Selector, doc *html.Node) { + for _, n := range sel.MatchAll(doc) { + cs := make([]*html.Node, 0) + for c := n.FirstChild; c != nil; c = c.NextSibling { + cs = append(cs, c) + } + for _, c := range cs { + n.RemoveChild(c) + n.Parent.InsertBefore(c, n) + } + n.Parent.RemoveChild(n) + } + html.Render(os.Stdout, doc) +} diff --git a/shared/shared.go b/shared/shared.go @@ -0,0 +1,48 @@ +package shared + +import ( + "fmt" + "io" + "os" + + "golang.org/x/net/html" +) + +var currentFile string + +func readerFromFile(file string) (f io.Reader, err error) { + if file == "-" { + 
currentFile = "[stdin]" + f = os.Stdin + } else { + currentFile = file + f, err = os.Open(file) + if err != nil { + return + } + } + return +} + +func LogErr(err error) { + fmt.Fprintf(os.Stderr, "%s: %v\n", currentFile, err) +} + +func Main(args []string, handleFunc func(*html.Node)) { + if len(args) == 0 { + args = append(args, "-") + } + for _, a := range args { + f, err := readerFromFile(a) + if err != nil { + LogErr(err) + continue + } + doc, err := html.Parse(f) + if err != nil { + LogErr(err) + return + } + handleFunc(doc) + } +}