htmltools

Various command line tools to transform HTML documents
git clone git://git.entf.net/htmltools
Log | Files | Refs | README | LICENSE

commit c4f4270f81fb5506346c1fbb474f1e5939634918
parent acb1b50c425833e1e92883c20c663f02b175a065
Author: Lukas Henkel <lh@entf.net>
Date:   Tue, 16 Feb 2021 20:43:32 +0100

More idiomatic project structure

Diffstat:
M Makefile | 37 ++++++++++++++++++++++---------------
A cmd/htmlattr/main.go | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A cmd/htmlindentheadings/main.go | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
A cmd/htmlremove/main.go | 34 ++++++++++++++++++++++++++++++++++
A cmd/htmlselect/main.go | 40 ++++++++++++++++++++++++++++++++++++++++
A cmd/htmltotext/main.go | 26 ++++++++++++++++++++++++++
A cmd/htmlunwrap/main.go | 42 ++++++++++++++++++++++++++++++++++++++++++
R htmlattr/htmlattr.1.scd -> doc/htmlattr.1.scd | 0
R htmlindentheadings/htmlindentheadings.1.scd -> doc/htmlindentheadings.1.scd | 0
R htmlremove/htmlremove.1.scd -> doc/htmlremove.1.scd | 0
R htmlselect/htmlselect.1.scd -> doc/htmlselect.1.scd | 0
R htmltotext/htmltotext.1.scd -> doc/htmltotext.1.scd | 0
R htmlunwrap/htmlunwrap.1.scd -> doc/htmlunwrap.1.scd | 0
D htmlattr/main.go | 58 ----------------------------------------------------------
D htmlindentheadings/main.go | 51 ---------------------------------------------------
D htmlremove/main.go | 34 ----------------------------------
D htmlselect/main.go | 40 ----------------------------------------
D htmltotext/main.go | 26 --------------------------
D htmlunwrap/main.go | 42 ------------------------------------------
A shared.go | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
D shared/shared.go | 48 ------------------------------------------------
21 files changed, 321 insertions(+), 314 deletions(-)

diff --git a/Makefile b/Makefile @@ -1,28 +1,35 @@ -TOOLS = htmlremove htmltotext htmlunwrap htmlselect htmlindentheadings htmlattr +VPATH = doc PREFIX = /usr/local -MANS = $(shell find . -name '*.scd' | sed s/\.scd//) -all: $(TOOLS) $(MANS) +TOOLS := \ + htmlattr \ + htmlindentheadings \ + htmlremove \ + htmlselect \ + htmltotext \ + htmlunwrap +DOCS := $(addsuffix .1, $(TOOLS)) -$(TOOLS): - mkdir -p bin - go build -o bin/$@ entf.net/htmltools/$@ +SRC := $(shell find . -name "*.go") + +all: $(TOOLS) $(DOCS) + +$(TOOLS): $(SRC) + go build entf.net/htmltools/cmd/$@ %.1: %.1.scd scdoc < $< > $@ install: all - mkdir -p "$(PREFIX)/bin" - cp $(addprefix bin/, $(TOOLS)) "$(PREFIX)/bin/" - mkdir -p "$(PREFIX)/share/man/man1" - cp $(MANS) "$(PREFIX)/share/man/man1/" + install -Dm755 $(TOOLS) -t "$(PREFIX)/bin/" + install -Dm644 $(DOCS) -t "$(PREFIX)/share/man/man1/" uninstall: -rm -- $(addprefix $(PREFIX)/bin/, $(TOOLS)) - -rm -- $(addprefix $(PREFIX)/share/man/man1/, $(notdir $(MANS))) + -rm -- $(addprefix $(PREFIX)/share/man/man1/, $(DOCS)) clean: - -rm -r bin/ - -rm -- $(MANS) + -rm -- $(TOOLS) + -rm -- $(DOCS) -.PHONY: all $(TOOLS) install uninstall clean- \ No newline at end of file +.PHONY: all install uninstall clean+ \ No newline at end of file diff --git a/cmd/htmlattr/main.go b/cmd/htmlattr/main.go @@ -0,0 +1,58 @@ +package main + +import ( + "flag" + "fmt" + "os" + "strings" + + "entf.net/htmltools" + "golang.org/x/net/html" +) + +func main() { + var fs string + flag.StringVar(&fs, "fs", ",", "field seperator") + flag.Parse() + args := flag.Args() + if len(args) == 0 { + fmt.Println("usage: htmlattr [-fs FIELD_SEPERATOR] ATTRIBUTES [FILES...]") + os.Exit(1) + } + attrs := strings.Split(args[0], fs) + for i, attr := range attrs { + attrs[i] = strings.ToLower(attr) + } + htmltools.Main(args[1:], func(doc *html.Node) { + var body *html.Node + for n := doc.FirstChild.FirstChild; n != nil; n = n.NextSibling { + if strings.ToLower(n.Data) == "body" { + body = n + 
break + } + } + if body == nil { + fmt.Fprintln(os.Stderr, "document does not contain a body") + os.Exit(1) + } + for n := body.FirstChild; n != nil; n = n.NextSibling { + if n.Type != html.ElementNode { + continue + } + list := make([]string, len(attrs)) + var any bool + for i, attrn := range attrs { + for _, attr := range n.Attr { + if strings.ToLower(attr.Key) == attrn { + any = true + list[i] = attr.Val + } + } + } + line := strings.Join(list, fs) + if any { + fmt.Println(line) + } + } + }) +} diff --git a/cmd/htmlindentheadings/main.go b/cmd/htmlindentheadings/main.go @@ -0,0 +1,51 @@ +package main + +import ( + "fmt" + "os" + "strconv" + + "golang.org/x/net/html" + + "entf.net/htmltools" +) + +const usage = "usage: htmlindentheadings INDENT_LEVELS [FILES...]" + +func main() { + args := os.Args[1:] + if len(args) == 0 { + fmt.Println(usage) + os.Exit(1) + } + lvls, err := strconv.Atoi(args[0]) + if err != nil { + fmt.Println(usage) + os.Exit(1) + } + htmltools.Main(args[1:], func(doc *html.Node) { + visit(lvls, doc) + html.Render(os.Stdout, doc) + }) +} + +func indent(lvls int, tag string) string { + l := int(tag[1]) - 48 + l += lvls + if l > 6 { + l = 6 + } + return fmt.Sprintf("h%d", l) +} + +func visit(lvls int, n *html.Node) { + if n.Type == html.ElementNode { + switch n.Data { + case "h1", "h2", "h3", "h4", "h5", "h6": + n.Data = indent(lvls, n.Data) + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + visit(lvls, c) + } +} diff --git a/cmd/htmlremove/main.go b/cmd/htmlremove/main.go @@ -0,0 +1,34 @@ +package main + +import ( + "fmt" + "os" + + "github.com/andybalholm/cascadia" + "golang.org/x/net/html" + + "entf.net/htmltools" +) + +func main() { + args := os.Args[1:] + if len(args) == 0 { + fmt.Println("usage: htmlremove SELECTOR [FILES...]") + os.Exit(1) + } + sel, err := cascadia.Compile(args[0]) + if err != nil { + fmt.Fprintf(os.Stderr, "selector invalid: %v\n", err) + os.Exit(1) + } + htmltools.Main(args[1:], func(doc *html.Node) { + 
remove(sel, doc) + }) +} + +func remove(sel cascadia.Selector, doc *html.Node) { + for _, n := range sel.MatchAll(doc) { + n.Parent.RemoveChild(n) + } + html.Render(os.Stdout, doc) +} diff --git a/cmd/htmlselect/main.go b/cmd/htmlselect/main.go @@ -0,0 +1,40 @@ +package main + +import ( + "bytes" + "fmt" + "os" + "strings" + + "github.com/andybalholm/cascadia" + "golang.org/x/net/html" + + "entf.net/htmltools" +) + +func main() { + args := os.Args[1:] + if len(args) == 0 { + fmt.Println("usage: htmlselect SELECTOR [FILES...]") + os.Exit(1) + } + sel, err := cascadia.Compile(args[0]) + if err != nil { + fmt.Fprintf(os.Stderr, "selector invalid: %v\n", err) + os.Exit(1) + } + htmltools.Main(args[1:], func(doc *html.Node) { + dosel(sel, doc) + }) +} + +func dosel(sel cascadia.Selector, doc *html.Node) { + for _, n := range sel.MatchAll(doc) { + buf := &bytes.Buffer{} + html.Render(buf, n) + l := buf.String() + l = strings.ReplaceAll(l, "\n", " ") + l = strings.TrimSpace(l) + fmt.Println(l) + } +} diff --git a/cmd/htmltotext/main.go b/cmd/htmltotext/main.go @@ -0,0 +1,26 @@ +package main + +import ( + "fmt" + "os" + "strings" + + "golang.org/x/net/html" + + "entf.net/htmltools" +) + +func main() { + htmltools.Main(os.Args[1:], visit) +} + +func visit(n *html.Node) { + if n.Type == html.TextNode { + if t := strings.TrimSpace(n.Data); t != "" { + fmt.Println(t) + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + visit(c) + } +} diff --git a/cmd/htmlunwrap/main.go b/cmd/htmlunwrap/main.go @@ -0,0 +1,42 @@ +package main + +import ( + "fmt" + "os" + + "github.com/andybalholm/cascadia" + "golang.org/x/net/html" + + "entf.net/htmltools" +) + +func main() { + args := os.Args[1:] + if len(args) == 0 { + fmt.Println("usage: htmlremove SELECTOR [FILES...]") + os.Exit(1) + } + sel, err := cascadia.Compile(args[0]) + if err != nil { + fmt.Fprintf(os.Stderr, "selector invalid: %v\n", err) + os.Exit(1) + } + htmltools.Main(args[1:], func(doc *html.Node) { + unwrap(sel, 
doc) + }) +} + +func unwrap(sel cascadia.Selector, doc *html.Node) { + for _, n := range sel.MatchAll(doc) { + cs := make([]*html.Node, 0) + for c := n.FirstChild; c != nil; c = c.NextSibling { + cs = append(cs, c) + } + for _, c := range cs { + n.RemoveChild(c) + n.Parent.InsertBefore(c, n) + } + n.Parent.RemoveChild(n) + } + html.Render(os.Stdout, doc) +} diff --git a/htmlattr/htmlattr.1.scd b/doc/htmlattr.1.scd diff --git a/htmlindentheadings/htmlindentheadings.1.scd b/doc/htmlindentheadings.1.scd diff --git a/htmlremove/htmlremove.1.scd b/doc/htmlremove.1.scd diff --git a/htmlselect/htmlselect.1.scd b/doc/htmlselect.1.scd diff --git a/htmltotext/htmltotext.1.scd b/doc/htmltotext.1.scd diff --git a/htmlunwrap/htmlunwrap.1.scd b/doc/htmlunwrap.1.scd diff --git a/htmlattr/main.go b/htmlattr/main.go @@ -1,58 +0,0 @@ -package main // import "entf.net/htmltools/htmlattr" - -import ( - "flag" - "fmt" - "os" - "strings" - - "entf.net/htmltools/shared" - "golang.org/x/net/html" -) - -func main() { - var fs string - flag.StringVar(&fs, "fs", ",", "field seperator") - flag.Parse() - args := flag.Args() - if len(args) == 0 { - fmt.Println("usage: htmlattr [-fs FIELD_SEPERATOR] ATTRIBUTES [FILES...]") - os.Exit(1) - } - attrs := strings.Split(args[0], fs) - for i, attr := range attrs { - attrs[i] = strings.ToLower(attr) - } - shared.Main(args[1:], func(doc *html.Node) { - var body *html.Node - for n := doc.FirstChild.FirstChild; n != nil; n = n.NextSibling { - if strings.ToLower(n.Data) == "body" { - body = n - break - } - } - if body == nil { - fmt.Fprintln(os.Stderr, "document does not contain a body") - os.Exit(1) - } - for n := body.FirstChild; n != nil; n = n.NextSibling { - if n.Type != html.ElementNode { - continue - } - list := make([]string, len(attrs)) - var any bool - for i, attrn := range attrs { - for _, attr := range n.Attr { - if strings.ToLower(attr.Key) == attrn { - any = true - list[i] = attr.Val - } - } - } - line := strings.Join(list, fs) - if any { - 
fmt.Println(line) - } - } - }) -} diff --git a/htmlindentheadings/main.go b/htmlindentheadings/main.go @@ -1,51 +0,0 @@ -package main // import "entf.net/htmltools/htmlindentheadings" - -import ( - "fmt" - "os" - "strconv" - - "golang.org/x/net/html" - - "entf.net/htmltools/shared" -) - -const usage = "usage: htmlindentheadings INDENT_LEVELS [FILES...]" - -func main() { - args := os.Args[1:] - if len(args) == 0 { - fmt.Println(usage) - os.Exit(1) - } - lvls, err := strconv.Atoi(args[0]) - if err != nil { - fmt.Println(usage) - os.Exit(1) - } - shared.Main(args[1:], func(doc *html.Node) { - visit(lvls, doc) - html.Render(os.Stdout, doc) - }) -} - -func indent(lvls int, tag string) string { - l := int(tag[1]) - 48 - l += lvls - if l > 6 { - l = 6 - } - return fmt.Sprintf("h%d", l) -} - -func visit(lvls int, n *html.Node) { - if n.Type == html.ElementNode { - switch n.Data { - case "h1", "h2", "h3", "h4", "h5", "h6": - n.Data = indent(lvls, n.Data) - } - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - visit(lvls, c) - } -} diff --git a/htmlremove/main.go b/htmlremove/main.go @@ -1,34 +0,0 @@ -package main // import "entf.net/htmltools/htmlremove" - -import ( - "fmt" - "os" - - "github.com/andybalholm/cascadia" - "golang.org/x/net/html" - - "entf.net/htmltools/shared" -) - -func main() { - args := os.Args[1:] - if len(args) == 0 { - fmt.Println("usage: htmlremove SELECTOR [FILES...]") - os.Exit(1) - } - sel, err := cascadia.Compile(args[0]) - if err != nil { - fmt.Fprintf(os.Stderr, "selector invalid: %v\n", err) - os.Exit(1) - } - shared.Main(args[1:], func(doc *html.Node) { - remove(sel, doc) - }) -} - -func remove(sel cascadia.Selector, doc *html.Node) { - for _, n := range sel.MatchAll(doc) { - n.Parent.RemoveChild(n) - } - html.Render(os.Stdout, doc) -} diff --git a/htmlselect/main.go b/htmlselect/main.go @@ -1,40 +0,0 @@ -package main // import "entf.net/htmltools/htmlselect" - -import ( - "bytes" - "fmt" - "os" - "strings" - - 
"github.com/andybalholm/cascadia" - "golang.org/x/net/html" - - "entf.net/htmltools/shared" -) - -func main() { - args := os.Args[1:] - if len(args) == 0 { - fmt.Println("usage: htmlselect SELECTOR [FILES...]") - os.Exit(1) - } - sel, err := cascadia.Compile(args[0]) - if err != nil { - fmt.Fprintf(os.Stderr, "selector invalid: %v\n", err) - os.Exit(1) - } - shared.Main(args[1:], func(doc *html.Node) { - dosel(sel, doc) - }) -} - -func dosel(sel cascadia.Selector, doc *html.Node) { - for _, n := range sel.MatchAll(doc) { - buf := &bytes.Buffer{} - html.Render(buf, n) - l := buf.String() - l = strings.ReplaceAll(l, "\n", " ") - l = strings.TrimSpace(l) - fmt.Println(l) - } -} diff --git a/htmltotext/main.go b/htmltotext/main.go @@ -1,26 +0,0 @@ -package main // import "entf.net/htmltools/htmltotext" - -import ( - "fmt" - "os" - "strings" - - "golang.org/x/net/html" - - "entf.net/htmltools/shared" -) - -func main() { - shared.Main(os.Args[1:], visit) -} - -func visit(n *html.Node) { - if n.Type == html.TextNode { - if t := strings.TrimSpace(n.Data); t != "" { - fmt.Println(t) - } - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - visit(c) - } -} diff --git a/htmlunwrap/main.go b/htmlunwrap/main.go @@ -1,42 +0,0 @@ -package main // import "entf.net/htmltools/htmlunwrap" - -import ( - "fmt" - "os" - - "github.com/andybalholm/cascadia" - "golang.org/x/net/html" - - "entf.net/htmltools/shared" -) - -func main() { - args := os.Args[1:] - if len(args) == 0 { - fmt.Println("usage: htmlremove SELECTOR [FILES...]") - os.Exit(1) - } - sel, err := cascadia.Compile(args[0]) - if err != nil { - fmt.Fprintf(os.Stderr, "selector invalid: %v\n", err) - os.Exit(1) - } - shared.Main(args[1:], func(doc *html.Node) { - unwrap(sel, doc) - }) -} - -func unwrap(sel cascadia.Selector, doc *html.Node) { - for _, n := range sel.MatchAll(doc) { - cs := make([]*html.Node, 0) - for c := n.FirstChild; c != nil; c = c.NextSibling { - cs = append(cs, c) - } - for _, c := range cs { - 
n.RemoveChild(c) - n.Parent.InsertBefore(c, n) - } - n.Parent.RemoveChild(n) - } - html.Render(os.Stdout, doc) -} diff --git a/shared.go b/shared.go @@ -0,0 +1,48 @@ +package htmltools + +import ( + "fmt" + "io" + "os" + + "golang.org/x/net/html" +) + +var currentFile string + +func readerFromFile(file string) (f io.Reader, err error) { + if file == "-" { + currentFile = "[stdin]" + f = os.Stdin + } else { + currentFile = file + f, err = os.Open(file) + if err != nil { + return + } + } + return +} + +func LogErr(err error) { + fmt.Fprintf(os.Stderr, "%s: %v\n", currentFile, err) +} + +func Main(args []string, handleFunc func(*html.Node)) { + if len(args) == 0 { + args = append(args, "-") + } + for _, a := range args { + f, err := readerFromFile(a) + if err != nil { + LogErr(err) + continue + } + doc, err := html.Parse(f) + if err != nil { + LogErr(err) + return + } + handleFunc(doc) + } +} diff --git a/shared/shared.go b/shared/shared.go @@ -1,48 +0,0 @@ -package shared - -import ( - "fmt" - "io" - "os" - - "golang.org/x/net/html" -) - -var currentFile string - -func readerFromFile(file string) (f io.Reader, err error) { - if file == "-" { - currentFile = "[stdin]" - f = os.Stdin - } else { - currentFile = file - f, err = os.Open(file) - if err != nil { - return - } - } - return -} - -func LogErr(err error) { - fmt.Fprintf(os.Stderr, "%s: %v\n", currentFile, err) -} - -func Main(args []string, handleFunc func(*html.Node)) { - if len(args) == 0 { - args = append(args, "-") - } - for _, a := range args { - f, err := readerFromFile(a) - if err != nil { - LogErr(err) - continue - } - doc, err := html.Parse(f) - if err != nil { - LogErr(err) - return - } - handleFunc(doc) - } -}