172 lines
3.6 KiB
Go
172 lines
3.6 KiB
Go
// Copyright 2024 Matthew Rich <matthewrich.conf@gmail.com>. All rights reserved.
|
|
|
|
package main
|
|
|
|
import (
|
|
"io"
|
|
_ "io/ioutil"
|
|
"fmt"
|
|
"strings"
|
|
"regexp"
|
|
"tagger/internal/models"
|
|
"tagger/internal/client"
|
|
"flag"
|
|
"net/http"
|
|
"golang.org/x/net/html"
|
|
"container/list"
|
|
_ "unicode"
|
|
"net/url"
|
|
)
|
|
|
|
// htmlNode is a defined type whose underlying type is html.Node. It exists so
// that the search helpers in this file (Find, FindAll and FindNodes) can be
// declared as methods; values are converted to and from *html.Node with plain
// pointer conversions.
type htmlNode html.Node
|
|
|
|
func metaKeywords(n *htmlNode) []string {
|
|
var name, content string
|
|
re := regexp.MustCompile(`[^-,\w\s]+`)
|
|
fmt.Printf("meta ")
|
|
for _, attribute := range n.Attr {
|
|
fmt.Printf(" %s=%s", attribute.Key, attribute.Val)
|
|
if attribute.Key == "name" {
|
|
if attribute.Val != "keywords" && attribute.Val != "title" && attribute.Val != "description" {
|
|
fmt.Printf("\n")
|
|
return []string{}
|
|
} else {
|
|
name = attribute.Val
|
|
}
|
|
}
|
|
if attribute.Key == "content" {
|
|
content,_ = url.PathUnescape(attribute.Val)
|
|
}
|
|
}
|
|
fmt.Printf("\n")
|
|
if name != "" {
|
|
var terms []string
|
|
if name == "keywords" {
|
|
terms = strings.Split(re.ReplaceAllString(content, ""), ",")
|
|
} else {
|
|
terms = []string{strings.ReplaceAll(re.ReplaceAllString(content, ""), ",", "")}
|
|
}
|
|
for i,t := range(terms) {
|
|
terms[i] = strings.ToLower(strings.ReplaceAll(strings.TrimSpace(t), " ", "-"))
|
|
}
|
|
return terms
|
|
//return strings.FieldsFunc(content, func(r rune) bool { return ! ( unicode.IsLetter(r) || unicode.IsNumber(r) || r == '-' ) })
|
|
}
|
|
return []string{}
|
|
}
|
|
|
|
|
|
func (h *htmlNode) FindAll(tagName string) []*htmlNode {
|
|
if h == nil {
|
|
return nil
|
|
}
|
|
return h.FindNodes(tagName, false)
|
|
}
|
|
|
|
func (h *htmlNode) Find(tagName string) *htmlNode {
|
|
if h == nil {
|
|
return nil
|
|
}
|
|
results := h.FindNodes(tagName, true)
|
|
if len(results) > 0 {
|
|
return results[0]
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (h *htmlNode) FindNodes(tagName string, first bool) []*htmlNode {
|
|
if h == nil {
|
|
return nil
|
|
}
|
|
|
|
n := (*html.Node)(h)
|
|
q := list.New()
|
|
|
|
var results []*htmlNode
|
|
|
|
q.PushBack(n)
|
|
|
|
for q.Len() > 0 {
|
|
v := (*html.Node)(q.Remove(q.Front()).(*html.Node))
|
|
if v.Type == html.ElementNode && v.Data == tagName {
|
|
results = append(results, (*htmlNode)(v))
|
|
if first {
|
|
break
|
|
}
|
|
}
|
|
|
|
for c := v.FirstChild; c != nil; c = c.NextSibling {
|
|
q.PushBack(c)
|
|
}
|
|
}
|
|
return results
|
|
}
|
|
|
|
func extractTagsMetaDataFromUrl(resource io.ReadCloser) ([]string, error) {
|
|
var results []string
|
|
doc,e := html.Parse(resource)
|
|
if e != nil {
|
|
panic(e)
|
|
}
|
|
|
|
for _,v := range (*htmlNode)(doc).Find("head").FindAll("meta") {
|
|
results = append(results, metaKeywords(v)...)
|
|
}
|
|
return results, nil
|
|
}
|
|
|
|
func main() {
|
|
|
|
tag := flag.String("tag", "tag-name", "Tag name")
|
|
//resource := flag.String("resource", "resource URL", "Resource URL")
|
|
|
|
endpoint := flag.String("endpoint", "http://localhost:8080/api/v1", "API endpoint URL")
|
|
|
|
extractUrl := flag.String("extract", "", "Extract tags from resource URL")
|
|
|
|
flag.Parse()
|
|
|
|
fmt.Printf("%#v\n", extractUrl)
|
|
|
|
|
|
cli := client.New(*endpoint)
|
|
|
|
if e := cli.Ping(); e != nil {
|
|
panic(e)
|
|
}
|
|
|
|
if len(*extractUrl) > 0 {
|
|
s,e := http.Get(*extractUrl)
|
|
if e != nil {
|
|
panic(e)
|
|
}
|
|
terms, e := extractTagsMetaDataFromUrl(s.Body)
|
|
// filter terms
|
|
|
|
fmt.Printf("%s\n", terms)
|
|
|
|
for _,t := range(terms) {
|
|
cli.AddTagResource(t, *extractUrl)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
requestUrl := fmt.Sprintf("%s/tags/%s", *endpoint, *tag)
|
|
r,e := http.Get(requestUrl)
|
|
|
|
if e != nil {
|
|
panic(e)
|
|
}
|
|
|
|
defer r.Body.Close()
|
|
|
|
tagModel, e := models.NewTagFromJson(r.Body)
|
|
|
|
if e != nil {
|
|
//http.Error(w, err.Error(), http.StatusBadRequest)
|
|
}
|
|
|
|
fmt.Printf("%#v\n", tagModel)
|
|
}
|