Skip to content

Commit 72a0553

Browse files
authored
Merge pull request #47 from beclab/feat/refactor
feat: file type change
2 parents ee0ad64 + abd10cb commit 72a0553

File tree

17 files changed

+83
-32
lines changed

17 files changed

+83
-32
lines changed

processor/processor.go

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import (
1414
"github.com/beclab/article-extractor/templates/postExtractor"
1515
)
1616

17-
// 得到content内容,主要在推荐算法爬取页面后解析正文内容
17+
// ArticleContentExtractor extracts the main article content; it is mainly used to parse the body text after the recommendation algorithm has crawled the page.
1818
func ArticleContentExtractor(rawContent, entryUrl string) (string, string) {
1919
entryDomain := domain(entryUrl)
2020
templateRawData := strings.NewReader(rawContent)
@@ -56,8 +56,8 @@ func ArticleContentExtractor(rawContent, entryUrl string) (string, string) {
5656
return content, pureContent
5757
}
5858

59-
// 根据url,不用正文内容获得下载信息
60-
// 对于ebook和pdf 通过url来解析,不需要爬取页面
59+
// DownloadTypeQueryByUrl derives download information from the URL alone, without using the page content.
60+
// eBook and PDF links are resolved directly from the URL, so the page does not need to be crawled.
6161
func DownloadTypeQueryByUrl(url string) (string, string, string) {
6262
funcs := reflect.ValueOf(&templates.Template{})
6363
_, mediaRule := getDownloadTypeByUrlRules(url)
@@ -72,7 +72,7 @@ func DownloadTypeQueryByUrl(url string) (string, string, string) {
7272
return "", "", ""
7373
}
7474

75-
// 根据模版表获得正文,作者,发布时间,以及下载信息
75+
// MetaDataQueryByTemplate obtains the main content, author, publication time, and download information using the template table.
7676
func MetaDataQueryByTemplate(entryUrl, rawContent string, doc *goquery.Document) (string, string, int64, string, string, string) {
7777
var content string
7878
var author string
@@ -109,8 +109,7 @@ func MetaDataQueryByTemplate(entryUrl, rawContent string, doc *goquery.Document)
109109
return content, author, publishedAt, mediaContent, downloadUrl, downloadType
110110
}
111111

112-
// 输入url,rawcontent
113-
// 输出entry的metadata
112+
// ArticleExtractor takes the raw page content and the entry URL and returns the entry's metadata.
114113
func ArticleExtractor(rawContent, entryUrl string) (string, string, *time.Time, string, string, string, int64, string, string, string) {
115114
templateRawData := strings.NewReader(rawContent)
116115
doc, _ := goquery.NewDocumentFromReader(templateRawData)

processor/rules.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ var contentTemplatePredefinedRules = map[string]string{
144144
"zhihu.com": "ZhihuExtractorMetaInfo",
145145
}
146146

147-
var DownloadTypeUrlTemplatedRules = map[string]string{
147+
var downloadTypeUrlTemplatedRules = map[string]string{
148148
"manybooks.net": "ManyBooksDownloadType", //need cookies
149149
"standardebooks.org": "StandardebooksDownloadType",
150150
"z-library.gs": "ZLibraryDownloadType", //need cookies
@@ -179,7 +179,7 @@ func getContentPostExtractorTemplateRules(websiteURL string) string {
179179

180180
func getDownloadTypeByUrlRules(websiteURL string) (string, string) {
181181
urlDomain := domain(websiteURL)
182-
for domain, rules := range DownloadTypeUrlTemplatedRules {
182+
for domain, rules := range downloadTypeUrlTemplatedRules {
183183
if strings.Contains(urlDomain, domain) {
184184
return domain, rules
185185
}

templates/acfun.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,5 @@ func (t *Template) ACFunExtractorMetaInfo(url string, document *goquery.Document
3636
author = s.Text()
3737
})
3838
publishedAt := acFunScrapPublishedAt(document)
39-
return content, author, publishedAt, "", url, "video"
39+
return content, author, publishedAt, "", url, VideoFileType
4040
}

templates/base.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"regexp"
77
"strconv"
88
"strings"
9+
"time"
910

1011
"github.com/PuerkitoBio/goquery"
1112
"github.com/beclab/article-extractor/readability"
@@ -19,6 +20,32 @@ type ExtractorFileInfo struct {
1920
FileType string
2021
}
2122

23+
// Time zone names intended to be passed as the location argument of
// ParseLocationTimestamp, which hands them to time.LoadLocation.
const (
24+
// ShanghaiTZ is the zone name used when parsing Bilibili publish times.
ShanghaiTZ = "Asia/Shanghai"
25+
// UrumqiTZ is the "Asia/Urumqi" zone name; no use of it is visible in this view.
UrumqiTZ = "Asia/Urumqi"
26+
)
27+
28+
// File type labels returned by the template extractors as their final
// (download/file type) result, replacing repeated string literals.
const (
29+
// VideoFileType marks an entry whose media is a video.
VideoFileType = "video"
30+
// AudioFileType marks an entry whose media is an audio file.
AudioFileType = "audio"
31+
// PdfFileType marks a downloadable PDF document.
PdfFileType = "pdf"
32+
// EbookFileType marks a downloadable e-book (for example an EPUB).
EbookFileType = "ebook"
33+
)
34+
35+
// ParseLocationTimestamp parses timeStr according to layout, interpreting the
// wall-clock time in the named time zone, and returns the instant as Unix
// seconds.
//
// location is handed to time.LoadLocation (for example "Asia/Shanghai").
// Text scraped from HTML often carries stray leading or trailing whitespace,
// so when the raw string does not parse the function retries once with that
// whitespace removed. On failure it returns 0 together with the error from
// the time package.
func ParseLocationTimestamp(timeStr, layout, location string) (int64, error) {
	loc, err := time.LoadLocation(location)
	if err != nil {
		return 0, err
	}

	t, err := time.ParseInLocation(layout, timeStr, loc)
	if err != nil {
		trimmed := strings.TrimSpace(timeStr)
		if trimmed == timeStr {
			return 0, err
		}
		// Retry only after the raw parse failed and trimming actually changed
		// the input, so every input that parsed before yields the same result.
		if t, err = time.ParseInLocation(layout, trimmed, loc); err != nil {
			return 0, err
		}
	}

	return t.Unix(), nil
}
48+
2249
func GetArticleByDivClass(document *goquery.Document) string {
2350
content := ""
2451
document.Find("div.entry-content,div.content-entry,div.article-detail,div.entry,div.entry__content,div.article__content,div.articleContent").Each(func(i int, s *goquery.Selection) {

templates/bilibili.go

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import (
1010

1111
func bilibiliScrapContent(document *goquery.Document) string {
1212
contents := ""
13-
document.Find("span.desc-info-text").Each(func(i int, s *goquery.Selection) {
13+
document.Find("span.desc-info-text,div.opus-module-content").Each(func(i int, s *goquery.Selection) {
1414
var content string
1515
content, _ = goquery.OuterHtml(s)
1616
contents += content
@@ -36,8 +36,20 @@ func bilibiliScrapContent(document *goquery.Document) string {
3636
}
3737

3838
func (t *Template) BilibiliExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {
39-
bvid := ""
39+
author := ""
40+
document.Find("div.fixed-author-header__author__name,div.opus-module-author__name").Each(func(i int, s *goquery.Selection) {
41+
author = strings.TrimSpace(s.Text())
42+
})
43+
var publishedAt int64 = 0
44+
document.Find("div.opus-module-author__pub__text").Each(func(i int, s *goquery.Selection) {
45+
publishTimes := s.Text()
46+
layout := "2006年01月02日 15:04"
47+
publishTimes = strings.TrimPrefix(publishTimes, "编辑于 ")
48+
publishedAt, _ = ParseLocationTimestamp(publishTimes, layout, ShanghaiTZ)
49+
})
4050
content := bilibiliScrapContent(document)
51+
52+
bvid := ""
4153
document.Find("meta[itemprop=url]").Each(func(i int, s *goquery.Selection) {
4254
if content, exists := s.Attr("content"); exists {
4355
videoPattern := `video/(\w+)`
@@ -51,13 +63,14 @@ func (t *Template) BilibiliExtractorMetaInfo(url string, document *goquery.Docum
5163
if bvid != "" {
5264
embeddingUrl := "https://www.bilibili.com/blackboard/html5mobileplayer.html?bvid=" + bvid + "&high_quality=1&autoplay=0"
5365
contents := "<iframe width='910' height='668' src='" + embeddingUrl + "' border='0' scrolling='no' border='0 frameborder='no' framespacing='0' allowfullscreen='true' referrerpolicy='no-referrer'></iframe>"
54-
return content, "", 0, contents, url, "video"
66+
return content, author, publishedAt, contents, url, VideoFileType
5567
}
56-
/*document.Find("meta[property='og:url']").Each(func(i int, s *goquery.Selection) {
57-
if content, exists := s.Attr("content"); exists {
58-
downloadUrl = content
59-
downloadType = "video"
60-
}
61-
})*/
62-
return content, "", 0, "", "", ""
68+
if strings.Contains(url, "bilibili.com/festival/") {
69+
return content, author, publishedAt, "", url, VideoFileType
70+
}
71+
if strings.Contains(url, "audio/au") {
72+
return content, author, publishedAt, "", url, AudioFileType
73+
}
74+
75+
return content, author, publishedAt, "", "", ""
6376
}

templates/lizhi.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010

1111
func (t *Template) LizhiExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {
1212
audioUrl := ""
13+
fileType := ""
1314
document.Find("script").Each(func(i int, s *goquery.Selection) {
1415
scriptContent, err := s.Html()
1516
if audioUrl == "" && err == nil && scriptContent != "" {
@@ -20,9 +21,10 @@ func (t *Template) LizhiExtractorMetaInfo(url string, document *goquery.Document
2021

2122
if len(matches) > 1 {
2223
audioUrl = matches[1]
24+
fileType = AudioFileType
2325
return
2426
}
2527
}
2628
})
27-
return "", "", 0, audioUrl, audioUrl, "audio"
29+
return "", "", 0, audioUrl, audioUrl, fileType
2830
}

templates/manybooks.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,13 @@ func (t *Template) ManyBooksDownloadType(urlStr string) (string, string, string)
3232
//ebook
3333
id := extractIDWithRegex(urlStr)
3434
downloadUrl := "https://library.manybooks.net/live/get-book/" + id + "/epub"
35-
return downloadUrl, id + ".epub", "ebook"
35+
return downloadUrl, id + ".epub", EbookFileType
3636
}
3737
if lastPart == "6" {
3838
//pdf
3939
id := extractIDWithRegex(urlStr)
4040
downloadUrl := "https://library.manybooks.net/live/get-book/" + id + "/pdf"
41-
return downloadUrl, id + ".pdf", "pdf"
41+
return downloadUrl, id + ".pdf", PdfFileType
4242
}
4343

4444
return "", "", ""

templates/podbean.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ func podBeanScrapContent(document *goquery.Document) string {
2323

2424
func (t *Template) PodBeanExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {
2525
audioUrl := ""
26+
fileType := ""
2627
content := podBeanScrapContent(document)
2728
scriptSelector := "script[type=\"application/ld+json\"]"
2829
document.Find(scriptSelector).Each(func(i int, s *goquery.Selection) {
@@ -38,9 +39,10 @@ func (t *Template) PodBeanExtractorMetaInfo(url string, document *goquery.Docume
3839
associatedMediaDetail := associatedMediaData.(map[string]interface{})
3940
if contentUrl, ok := associatedMediaDetail["contentUrl"]; ok {
4041
audioUrl = contentUrl.(string)
42+
fileType = AudioFileType
4143
}
4244
}
4345
}
4446
})
45-
return content, "", 0, audioUrl, audioUrl, "audio"
47+
return content, "", 0, audioUrl, audioUrl, fileType
4648
}

templates/rumble.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ func (t *Template) RumbleExtractorMetaInfo(entryUrl string, document *goquery.Do
3030

3131
if embeddingUrl != "" {
3232
contents := "<iframe width='960' height='540' src='" + embeddingUrl + "' frameborder='0' referrerpolicy='no-referrer'></iframe>"
33-
return "", "", 0, contents, entryUrl, "video"
33+
return "", "", 0, contents, entryUrl, VideoFileType
3434
}
3535
return "", "", 0, "", "", ""
3636
}

templates/speaker.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ func spreakerScrapContent(document *goquery.Document) string {
2323
func (t *Template) SpreakerExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {
2424
content := spreakerScrapContent(document)
2525
audioUrl := ""
26+
fileType := ""
2627
document.Find("meta[name='twitter:player']").Each(func(i int, s *goquery.Selection) {
2728
content, _ := s.Attr("content")
2829
//"https://widget.spreaker.com/player?episode_id=60819052&playlist=show&cover_image_url=https%3A%2F%2Fd3wo5wojvuv7l.cloudfront.net%2Fimages.spreaker.com%2Foriginal%2F551ce348940065825b0a755b58fdb5ae.jpg"
@@ -31,7 +32,8 @@ func (t *Template) SpreakerExtractorMetaInfo(url string, document *goquery.Docum
3132
if startIndex > -1 && endIndex > -1 {
3233
episodeID := content[startIndex : startIndex+endIndex]
3334
audioUrl = "https://api.spreaker.com/v2/episodes/" + episodeID + "/ondemand.mp3"
35+
fileType = AudioFileType
3436
}
3537
})
36-
return content, "", 0, audioUrl, audioUrl, "audio"
38+
return content, "", 0, audioUrl, audioUrl, fileType
3739
}

0 commit comments

Comments
 (0)