Skip to content

Commit ccc1891

Browse files
authored
Merge pull request #45 from beclab/feat/templates
add xhs
2 parents b8f0c89 + c737fe8 commit ccc1891

File tree

3 files changed

+101
-2
lines changed

3 files changed

+101
-2
lines changed

processor/processor.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ func ArticleContentExtractor(rawContent, entryUrl, feedUrl, rules string) (strin
100100
return content, pureContent
101101
}
102102

103-
func NonRawContentDownloadQueryInArticle(url string) (string, string, string) {
103+
func NonMediaDownloadQueryInArticle(url string) (string, string, string) {
104104
funcs := reflect.ValueOf(&templates.Template{})
105105
_, mediaRule := getNonRawContentDownloadScraperRules(url)
106106
if mediaRule != "" {
@@ -116,7 +116,7 @@ func NonRawContentDownloadQueryInArticle(url string) (string, string, string) {
116116
return "", "", ""
117117
}
118118

119-
func ExceptYTdlpDownloadQueryInArticle(rawContent, url string) (string, string) {
119+
func MediaDownloadQueryInArticle(rawContent, url string) (string, string) {
120120
templateRawData := strings.NewReader(rawContent)
121121
doc, _ := goquery.NewDocumentFromReader(templateRawData)
122122
funcs := reflect.ValueOf(&templates.Template{})

processor/rules.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ var contentTemplatePredefinedRules = map[string]string{
184184
"fandom.com": "FandomScrapContent",
185185
"v2ex.com": "V2exScrapContent",
186186
"okjike.com": "OKjikeScrapContent",
187+
"xhslink.com": "XhsScrapContent",
187188
//"screencrush.com": "ScreencrushScrapContent",
188189
/*"espn.com": "EspnScrapContent",
189190
"nbcsports.com": "NBCSportsScrapContent",
@@ -274,6 +275,7 @@ var metadataTemplatePredefinedRules = map[string]string{
274275
"v2ex.com": "V2exScrapMetaData",
275276
"okjike.com": "OKjikeScrapMetaData",
276277
"xiaoyuzhoufm.com": "XiaoyuzhouScrapMetaData",
278+
"xhslink.com": "XhsScrapMetaData",
277279
}
278280

279281
var publishedAtTimeStampTemplatePredefinedRules = map[string]string{
@@ -326,6 +328,7 @@ var publishedAtTimeStampTemplatePredefinedRules = map[string]string{
326328
"reddit.com": "RedditPublishedAtTimeFromScriptMetadata",
327329
"douban.com": "DoubanPublishedAtTimeFromScriptMetadata",
328330
"okjike.com": "OKjikePublishedAtTimeFromScriptMetadata",
331+
"xhslink.com": "XhsPublishedAtTimeFromScriptMetadata",
329332
}
330333

331334
func getPredefinedPublishedAtTimestampTemplateRules(websiteURL string) (string, string) {

templates/xhs.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
package templates
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"log"
7+
"strings"
8+
9+
"github.com/PuerkitoBio/goquery"
10+
)
11+
12+
func (t *Template) XhsScrapMetaData(document *goquery.Document) (string, string) {
13+
author := ""
14+
published_at := ""
15+
16+
var jsonData string
17+
document.Find("script").Each(func(i int, s *goquery.Selection) {
18+
scriptContent := s.Text()
19+
if strings.Contains(scriptContent, "window.__INITIAL_STATE__") {
20+
parts := strings.SplitN(scriptContent, "=", 2)
21+
if len(parts) == 2 {
22+
jsonData = strings.TrimSpace(parts[1])
23+
return
24+
}
25+
}
26+
})
27+
jsonData = strings.ReplaceAll(jsonData, "undefined", "null")
28+
var result map[string]interface{}
29+
if err := json.Unmarshal([]byte(jsonData), &result); err != nil {
30+
log.Printf("json unmarshal %v", err)
31+
}
32+
if note, ok := result["note"]; ok {
33+
if noteDetailMap, ok := note.(map[string]interface{})["noteDetailMap"]; ok {
34+
for key, item := range noteDetailMap.(map[string]interface{}) {
35+
if itemNote, ok := item.(map[string]interface{})["note"]; ok {
36+
if user, ok := itemNote.(map[string]interface{})["user"]; ok {
37+
if nickname, ok := user.(map[string]interface{})["nickname"]; ok {
38+
author = nickname.(string)
39+
}
40+
}
41+
}
42+
fmt.Printf("xhs Key: %s\n", key)
43+
}
44+
}
45+
}
46+
return author, published_at
47+
}
48+
49+
func (t *Template) XhsPublishedAtTimeFromScriptMetadata(document *goquery.Document) int64 {
50+
var publishedAt int64 = 0
51+
var jsonData string
52+
document.Find("script").Each(func(i int, s *goquery.Selection) {
53+
scriptContent := s.Text()
54+
if strings.Contains(scriptContent, "window.__INITIAL_STATE__") {
55+
parts := strings.SplitN(scriptContent, "=", 2)
56+
if len(parts) == 2 {
57+
jsonData = strings.TrimSpace(parts[1])
58+
return
59+
}
60+
}
61+
})
62+
jsonData = strings.ReplaceAll(jsonData, "undefined", "null")
63+
var result map[string]interface{}
64+
if err := json.Unmarshal([]byte(jsonData), &result); err != nil {
65+
log.Printf("json unmarshal %v", err)
66+
}
67+
if note, ok := result["note"]; ok {
68+
if noteDetailMap, ok := note.(map[string]interface{})["noteDetailMap"]; ok {
69+
for key, item := range noteDetailMap.(map[string]interface{}) {
70+
if itemNote, ok := item.(map[string]interface{})["note"]; ok {
71+
if lastUpdateTime, ok := itemNote.(map[string]interface{})["lastUpdateTime"]; ok {
72+
publishedAt = int64(lastUpdateTime.(float64) / 1000)
73+
}
74+
}
75+
fmt.Printf("xhs Key: %s\n", key)
76+
}
77+
}
78+
}
79+
return publishedAt
80+
}
81+
82+
func (t *Template) XhsScrapContent(document *goquery.Document) string {
83+
contents := ""
84+
85+
document.Find("meta[name='og:image']").Each(func(i int, s *goquery.Selection) {
86+
if content, exists := s.Attr("content"); exists {
87+
contents = contents + "<img src='" + content + "' /> <br>"
88+
}
89+
})
90+
document.Find("span.note-text").Each(func(i int, s *goquery.Selection) {
91+
var content string
92+
content, _ = goquery.OuterHtml(s)
93+
contents += content
94+
})
95+
return contents
96+
}

0 commit comments

Comments
 (0)