commit bce8ffedc776db66a88a6179a785834793bcf984
Author: Lukas Henkel <lh@entf.net>
Date: Fri, 22 Mar 2019 21:00:29 +0100
Initial commit
Diffstat:
7 files changed, 185 insertions(+), 0 deletions(-)
diff --git a/go.mod b/go.mod
@@ -0,0 +1,6 @@
+module entf.net/htmltools
+
+require (
+ github.com/andybalholm/cascadia v1.0.0
+ golang.org/x/net v0.0.0-20190320064053-1272bf9dcd53
+)
diff --git a/go.sum b/go.sum
@@ -0,0 +1,8 @@
+github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
+github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190320064053-1272bf9dcd53 h1:kcXqo9vE6fsZY5X5Rd7R1l7fTgnWaDCVmln65REefiE=
+golang.org/x/net v0.0.0-20190320064053-1272bf9dcd53/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
diff --git a/htmlremove/main.go b/htmlremove/main.go
@@ -0,0 +1,34 @@
+package main // import "entf.net/htmltools/htmlremove"
+
+import (
+ "fmt"
+ "os"
+
+ "github.com/andybalholm/cascadia"
+ "golang.org/x/net/html"
+
+ "entf.net/htmltools/shared"
+)
+
+// main compiles the CSS selector given as the first argument and hands the
+// remaining arguments to shared.Main, which calls parse on every document it
+// manages to read.
+//
+// It exits with status 1 when no selector is given or the selector does not
+// compile.
+func main() {
+	args := os.Args[1:]
+	if len(args) == 0 {
+		// Usage errors are diagnostics: write them to standard error, like
+		// the invalid-selector message below, so they never end up in a
+		// pipeline that consumes the HTML written to standard output.
+		fmt.Fprintln(os.Stderr, "usage: htmlremove SELECTOR [FILES...]")
+		os.Exit(1)
+	}
+	sel, err := cascadia.Compile(args[0])
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "selector invalid: %v\n", err)
+		os.Exit(1)
+	}
+	shared.Main(args[1:], func(doc *html.Node) {
+		parse(sel, doc)
+	})
+}
+
+// parse detaches every node matched by sel from the tree rooted at doc and
+// then renders the remaining tree to standard output.
+//
+// A rendering failure is reported through shared.LogErr.
+func parse(sel cascadia.Selector, doc *html.Node) {
+	for _, n := range sel.MatchAll(doc) {
+		// A node without a parent has nothing to be detached from; skip it
+		// rather than dereferencing a nil pointer.
+		if n.Parent == nil {
+			continue
+		}
+		n.Parent.RemoveChild(n)
+	}
+	// Surface write errors (closed pipe, full disk, ...) instead of silently
+	// producing truncated output.
+	if err := html.Render(os.Stdout, doc); err != nil {
+		shared.LogErr(err)
+	}
+}
diff --git a/htmltotext/htmltotext.1.scd b/htmltotext/htmltotext.1.scd
@@ -0,0 +1,21 @@
+HTMLTOTEXT(1)
+
+# NAME
+
+htmltotext - extract all text from an HTML document
+
+# SYNOPSIS
+
+*htmltotext* [_FILE_]...
+
+# DESCRIPTION
+
+Reads each file in sequence and prints all text without the HTML tags to
+standard output. If no FILE is given or FILE is -, read standard input.
+
+If any FILE cannot be processed, a message prefixed with the FILE name will be
+written to standard error.
+
+# AUTHOR
+
+Lukas Henkel <lh@entf.net>
diff --git a/htmltotext/main.go b/htmltotext/main.go
@@ -0,0 +1,26 @@
+package main // import "entf.net/htmltools/htmltotext"
+
+import (
+ "fmt"
+ "os"
+ "strings"
+
+ "golang.org/x/net/html"
+
+ "entf.net/htmltools/shared"
+)
+
+// main passes the command-line arguments (without the program name) to
+// shared.Main, with visit as the handler invoked for each parsed document.
+func main() {
+	files := os.Args[1:]
+	shared.Main(files, visit)
+}
+
+// visit walks the tree rooted at n depth-first, in document order. For every
+// text node it prints the node's data with surrounding white space trimmed,
+// one line per node; text that is empty after trimming is not printed.
+func visit(n *html.Node) {
+	if n.Type == html.TextNode {
+		text := strings.TrimSpace(n.Data)
+		if len(text) > 0 {
+			fmt.Println(text)
+		}
+	}
+	child := n.FirstChild
+	for child != nil {
+		visit(child)
+		child = child.NextSibling
+	}
+}
diff --git a/htmlunwrap/main.go b/htmlunwrap/main.go
@@ -0,0 +1,42 @@
+package main // import "entf.net/htmltools/htmlunwrap"
+
+import (
+ "fmt"
+ "os"
+
+ "github.com/andybalholm/cascadia"
+ "golang.org/x/net/html"
+
+ "entf.net/htmltools/shared"
+)
+
+// main compiles the CSS selector given as the first argument and hands the
+// remaining arguments to shared.Main, which calls unwrap on every document it
+// manages to read.
+//
+// It exits with status 1 when no selector is given or the selector does not
+// compile.
+func main() {
+	args := os.Args[1:]
+	if len(args) == 0 {
+		// The usage text names this program (it used to say "htmlremove") and
+		// goes to standard error, like the invalid-selector message below, so
+		// it never mixes with the HTML written to standard output.
+		fmt.Fprintln(os.Stderr, "usage: htmlunwrap SELECTOR [FILES...]")
+		os.Exit(1)
+	}
+	sel, err := cascadia.Compile(args[0])
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "selector invalid: %v\n", err)
+		os.Exit(1)
+	}
+	shared.Main(args[1:], func(doc *html.Node) {
+		unwrap(sel, doc)
+	})
+}
+
+// unwrap replaces every node matched by sel with that node's children: the
+// children are moved, in their original order, to the position of the matched
+// node, and the emptied node is removed. The resulting tree is rendered to
+// standard output.
+//
+// A rendering failure is reported through shared.LogErr.
+func unwrap(sel cascadia.Selector, doc *html.Node) {
+	for _, n := range sel.MatchAll(doc) {
+		parent := n.Parent
+		// A node without a parent has no position to move its children to;
+		// skip it rather than dereferencing a nil pointer.
+		if parent == nil {
+			continue
+		}
+		// Each removal advances n.FirstChild, so repeatedly taking the first
+		// child and inserting it before n keeps the original order without
+		// collecting the children in a temporary slice first.
+		for c := n.FirstChild; c != nil; c = n.FirstChild {
+			n.RemoveChild(c)
+			parent.InsertBefore(c, n)
+		}
+		parent.RemoveChild(n)
+	}
+	// Surface write errors (closed pipe, full disk, ...) instead of silently
+	// producing truncated output.
+	if err := html.Render(os.Stdout, doc); err != nil {
+		shared.LogErr(err)
+	}
+}
diff --git a/shared/shared.go b/shared/shared.go
@@ -0,0 +1,48 @@
+package shared
+
+import (
+ "fmt"
+ "io"
+ "os"
+
+ "golang.org/x/net/html"
+)
+
+var currentFile string // name of the input selected last by readerFromFile; LogErr prints it as the message prefix
+
+// readerFromFile returns a reader for the named input and records the name
+// in currentFile, which LogErr uses as its message prefix. The name "-"
+// selects standard input, recorded as "[stdin]"; any other name is opened as
+// a file.
+//
+// On failure the returned reader is a literal nil. Storing the nil *os.File
+// that os.Open returns alongside an error in the io.Reader result would
+// produce a non-nil interface value.
+func readerFromFile(file string) (io.Reader, error) {
+	if file == "-" {
+		currentFile = "[stdin]"
+		return os.Stdin, nil
+	}
+	// Record the name before opening so that an open error is reported with
+	// the right prefix.
+	currentFile = file
+	f, err := os.Open(file)
+	if err != nil {
+		return nil, err
+	}
+	return f, nil
+}
+
+// LogErr writes err to standard error as a single line, prefixed with the
+// input name currently stored in currentFile and a colon.
+func LogErr(err error) {
+	fmt.Fprintln(os.Stderr, currentFile+":", err)
+}
+
+// Main parses each input named in args as HTML and passes the root of the
+// parsed document to handleFunc. When args is empty, standard input ("-") is
+// read instead.
+//
+// An input that cannot be opened or parsed is reported through LogErr and
+// skipped; the remaining inputs are still processed.
+func Main(args []string, handleFunc func(*html.Node)) {
+	if len(args) == 0 {
+		// Use a fresh slice rather than appending to the caller's.
+		args = []string{"-"}
+	}
+	for _, a := range args {
+		f, err := readerFromFile(a)
+		if err != nil {
+			LogErr(err)
+			continue
+		}
+		doc, err := html.Parse(f)
+		// Parsing is finished (or has failed), so the reader is no longer
+		// needed: release the file now instead of keeping one descriptor
+		// open per argument until the process exits. Standard input is left
+		// open. The Close error of a file that was only read carries nothing
+		// actionable and is deliberately ignored.
+		if c, ok := f.(io.Closer); ok && a != "-" {
+			_ = c.Close()
+		}
+		if err != nil {
+			// Skip only this input; a bad file must not prevent the ones
+			// after it from being processed.
+			LogErr(err)
+			continue
+		}
+		handleFunc(doc)
+	}
+}