Merge pull request #47 from beclab/feat/refactor

kaki-admin · web-flow · commit 72a05530e8c2 · 2025-08-28T20:37:57.000+08:00
feat: file type change
diff --git a/processor/processor.go b/processor/processor.go
@@ -14,7 +14,7 @@ import (
 	"github.com/beclab/article-extractor/templates/postExtractor"
 )
 
-// 得到content内容，主要在推荐算法爬取页面后解析正文内容
+// Obtain the content, primarily by parsing the main body after crawling the page with the recommendation algorithm
 func ArticleContentExtractor(rawContent, entryUrl string) (string, string) {
 	entryDomain := domain(entryUrl)
 	templateRawData := strings.NewReader(rawContent)
@@ -56,8 +56,8 @@ func ArticleContentExtractor(rawContent, entryUrl string) (string, string) {
 	return content, pureContent
 }
 
-// 根据url，不用正文内容获得下载信息
-// 对于ebook和pdf 通过url来解析，不需要爬取页面
+// Obtain download information based on the URL, without using the main content.
+// Parse eBook and PDF information directly from the URL, without crawling the page.
 func DownloadTypeQueryByUrl(url string) (string, string, string) {
 	funcs := reflect.ValueOf(&templates.Template{})
 	_, mediaRule := getDownloadTypeByUrlRules(url)
@@ -72,7 +72,7 @@ func DownloadTypeQueryByUrl(url string) (string, string, string) {
 	return "", "", ""
 }
 
-// 根据模版表获得正文,作者，发布时间，以及下载信息
+// Obtain the main content, author, publication date, and download information based on the template table.
 func MetaDataQueryByTemplate(entryUrl, rawContent string, doc *goquery.Document) (string, string, int64, string, string, string) {
 	var content string
 	var author string
@@ -109,8 +109,7 @@ func MetaDataQueryByTemplate(entryUrl, rawContent string, doc *goquery.Document)
 	return content, author, publishedAt, mediaContent, downloadUrl, downloadType
 }
 
-// 输入url，rawcontent
-// 输出entry的metadata
+// Input the URL and raw content, and output the metadata of the entry.
 func ArticleExtractor(rawContent, entryUrl string) (string, string, *time.Time, string, string, string, int64, string, string, string) {
 	templateRawData := strings.NewReader(rawContent)
 	doc, _ := goquery.NewDocumentFromReader(templateRawData)
diff --git a/processor/rules.go b/processor/rules.go
@@ -144,7 +144,7 @@ var contentTemplatePredefinedRules = map[string]string{
 	"zhihu.com":              "ZhihuExtractorMetaInfo",
 }
 
-var DownloadTypeUrlTemplatedRules = map[string]string{
+var downloadTypeUrlTemplatedRules = map[string]string{
 	"manybooks.net":      "ManyBooksDownloadType", //need cookies
 	"standardebooks.org": "StandardebooksDownloadType",
 	"z-library.gs":       "ZLibraryDownloadType", //need cookies
@@ -179,7 +179,7 @@ func getContentPostExtractorTemplateRules(websiteURL string) string {
 
 func getDownloadTypeByUrlRules(websiteURL string) (string, string) {
 	urlDomain := domain(websiteURL)
-	for domain, rules := range DownloadTypeUrlTemplatedRules {
+	for domain, rules := range downloadTypeUrlTemplatedRules {
 		if strings.Contains(urlDomain, domain) {
 			return domain, rules
 		}
diff --git a/templates/acfun.go b/templates/acfun.go
@@ -36,5 +36,5 @@ func (t *Template) ACFunExtractorMetaInfo(url string, document *goquery.Document
 		author = s.Text()
 	})
 	publishedAt := acFunScrapPublishedAt(document)
-	return content, author, publishedAt, "", url, "video"
+	return content, author, publishedAt, "", url, VideoFileType
 }
diff --git a/templates/base.go b/templates/base.go
@@ -6,6 +6,7 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/PuerkitoBio/goquery"
 	"github.com/beclab/article-extractor/readability"
@@ -19,6 +20,32 @@ type ExtractorFileInfo struct {
 	FileType    string
 }
 
+const (
+	ShanghaiTZ = "Asia/Shanghai"
+	UrumqiTZ   = "Asia/Urumqi"
+)
+
+const (
+	VideoFileType = "video"
+	AudioFileType = "audio"
+	PdfFileType   = "pdf"
+	EbookFileType = "ebook"
+)
+
+func ParseLocationTimestamp(timeStr, layout string, location string) (int64, error) {
+	loc, err := time.LoadLocation(location)
+	if err != nil {
+		return 0, err
+	}
+
+	t, err := time.ParseInLocation(layout, timeStr, loc)
+	if err != nil {
+		return 0, err
+	}
+
+	return t.Unix(), nil
+}
+
 func GetArticleByDivClass(document *goquery.Document) string {
 	content := ""
 	document.Find("div.entry-content,div.content-entry,div.article-detail,div.entry,div.entry__content,div.article__content,div.articleContent").Each(func(i int, s *goquery.Selection) {
diff --git a/templates/bilibili.go b/templates/bilibili.go
@@ -10,7 +10,7 @@ import (
 
 func bilibiliScrapContent(document *goquery.Document) string {
 	contents := ""
-	document.Find("span.desc-info-text").Each(func(i int, s *goquery.Selection) {
+	document.Find("span.desc-info-text,div.opus-module-content").Each(func(i int, s *goquery.Selection) {
 		var content string
 		content, _ = goquery.OuterHtml(s)
 		contents += content
@@ -36,8 +36,20 @@ func bilibiliScrapContent(document *goquery.Document) string {
 }
 
 func (t *Template) BilibiliExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {
-	bvid := ""
+	author := ""
+	document.Find("div.fixed-author-header__author__name,div.opus-module-author__name").Each(func(i int, s *goquery.Selection) {
+		author = strings.TrimSpace(s.Text())
+	})
+	var publishedAt int64 = 0
+	document.Find("div.opus-module-author__pub__text").Each(func(i int, s *goquery.Selection) {
+		publishTimes := s.Text()
+		layout := "2006年01月02日 15:04"
+		publishTimes = strings.TrimPrefix(publishTimes, "编辑于 ")
+		publishedAt, _ = ParseLocationTimestamp(publishTimes, layout, ShanghaiTZ)
+	})
 	content := bilibiliScrapContent(document)
+
+	bvid := ""
 	document.Find("meta[itemprop=url]").Each(func(i int, s *goquery.Selection) {
 		if content, exists := s.Attr("content"); exists {
 			videoPattern := `video/(\w+)`
@@ -51,13 +63,14 @@ func (t *Template) BilibiliExtractorMetaInfo(url string, document *goquery.Docum
 	if bvid != "" {
 		embeddingUrl := "https://www.bilibili.com/blackboard/html5mobileplayer.html?bvid=" + bvid + "&amp;high_quality=1&amp;autoplay=0"
 		contents := "<iframe width='910' height='668' src='" + embeddingUrl + "'  border='0' scrolling='no' border='0 frameborder='no' framespacing='0' allowfullscreen='true' referrerpolicy='no-referrer'></iframe>"
-		return content, "", 0, contents, url, "video"
+		return content, author, publishedAt, contents, url, VideoFileType
 	}
-	/*document.Find("meta[property='og:url']").Each(func(i int, s *goquery.Selection) {
-		if content, exists := s.Attr("content"); exists {
-			downloadUrl = content
-			downloadType = "video"
-		}
-	})*/
-	return content, "", 0, "", "", ""
+	if strings.Contains(url, "bilibili.com/festival/") {
+		return content, author, publishedAt, "", url, VideoFileType
+	}
+	if strings.Contains(url, "audio/au") {
+		return content, author, publishedAt, "", url, AudioFileType
+	}
+
+	return content, author, publishedAt, "", "", ""
 }
diff --git a/templates/lizhi.go b/templates/lizhi.go
@@ -10,6 +10,7 @@ import (
 
 func (t *Template) LizhiExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {
 	audioUrl := ""
+	fileType := ""
 	document.Find("script").Each(func(i int, s *goquery.Selection) {
 		scriptContent, err := s.Html()
 		if audioUrl == "" && err == nil && scriptContent != "" {
@@ -20,9 +21,10 @@ func (t *Template) LizhiExtractorMetaInfo(url string, document *goquery.Document
 
 			if len(matches) > 1 {
 				audioUrl = matches[1]
+				fileType = AudioFileType
 				return
 			}
 		}
 	})
-	return "", "", 0, audioUrl, audioUrl, "audio"
+	return "", "", 0, audioUrl, audioUrl, fileType
 }
diff --git a/templates/manybooks.go b/templates/manybooks.go
@@ -32,13 +32,13 @@ func (t *Template) ManyBooksDownloadType(urlStr string) (string, string, string)
 		//ebook
 		id := extractIDWithRegex(urlStr)
 		downloadUrl := "https://library.manybooks.net/live/get-book/" + id + "/epub"
-		return downloadUrl, id + ".epub", "ebook"
+		return downloadUrl, id + ".epub", EbookFileType
 	}
 	if lastPart == "6" {
 		//pdf
 		id := extractIDWithRegex(urlStr)
 		downloadUrl := "https://library.manybooks.net/live/get-book/" + id + "/pdf"
-		return downloadUrl, id + ".pdf", "pdf"
+		return downloadUrl, id + ".pdf", PdfFileType
 	}
 
 	return "", "", ""
diff --git a/templates/podbean.go b/templates/podbean.go
@@ -23,6 +23,7 @@ func podBeanScrapContent(document *goquery.Document) string {
 
 func (t *Template) PodBeanExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {
 	audioUrl := ""
+	fileType := ""
 	content := podBeanScrapContent(document)
 	scriptSelector := "script[type=\"application/ld+json\"]"
 	document.Find(scriptSelector).Each(func(i int, s *goquery.Selection) {
@@ -38,9 +39,10 @@ func (t *Template) PodBeanExtractorMetaInfo(url string, document *goquery.Docume
 				associatedMediaDetail := associatedMediaData.(map[string]interface{})
 				if contentUrl, ok := associatedMediaDetail["contentUrl"]; ok {
 					audioUrl = contentUrl.(string)
+					fileType = AudioFileType
 				}
 			}
 		}
 	})
-	return content, "", 0, audioUrl, audioUrl, "audio"
+	return content, "", 0, audioUrl, audioUrl, fileType
 }
diff --git a/templates/rumble.go b/templates/rumble.go
@@ -30,7 +30,7 @@ func (t *Template) RumbleExtractorMetaInfo(entryUrl string, document *goquery.Do
 
 	if embeddingUrl != "" {
 		contents := "<iframe width='960' height='540' src='" + embeddingUrl + "'  frameborder='0'  referrerpolicy='no-referrer'></iframe>"
-		return "", "", 0, contents, entryUrl, "video"
+		return "", "", 0, contents, entryUrl, VideoFileType
 	}
 	return "", "", 0, "", "", ""
 }
diff --git a/templates/speaker.go b/templates/speaker.go
@@ -23,6 +23,7 @@ func spreakerScrapContent(document *goquery.Document) string {
 func (t *Template) SpreakerExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {
 	content := spreakerScrapContent(document)
 	audioUrl := ""
+	fileType := ""
 	document.Find("meta[name='twitter:player']").Each(func(i int, s *goquery.Selection) {
 		content, _ := s.Attr("content")
 		//"https://widget.spreaker.com/player?episode_id=60819052&playlist=show&cover_image_url=https%3A%2F%2Fd3wo5wojvuv7l.cloudfront.net%2Fimages.spreaker.com%2Foriginal%2F551ce348940065825b0a755b58fdb5ae.jpg"
@@ -31,7 +32,8 @@ func (t *Template) SpreakerExtractorMetaInfo(url string, document *goquery.Docum
 		if startIndex > -1 && endIndex > -1 {
 			episodeID := content[startIndex : startIndex+endIndex]
 			audioUrl = "https://api.spreaker.com/v2/episodes/" + episodeID + "/ondemand.mp3"
+			fileType = AudioFileType
 		}
 	})
-	return content, "", 0, audioUrl, audioUrl, "audio"
+	return content, "", 0, audioUrl, audioUrl, fileType
 }
diff --git a/templates/standardebooks.go b/templates/standardebooks.go
@@ -33,7 +33,7 @@ func (t *Template) StandardebooksDownloadType(url string, document *goquery.Docu
 	if isEpubURL(url) {
 		fileName := extractStandardebooksName(url)
 		url = url + "?source=download"
-		return url, fileName, "ebook"
+		return url, fileName, EbookFileType
 	}
 	return "", "", ""
 
diff --git a/templates/storyfm.go b/templates/storyfm.go
@@ -18,8 +18,10 @@ func storyFMScrapContent(document *goquery.Document) string {
 func (t *Template) StoryFMExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {
 	content := storyFMScrapContent(document)
 	audioUrl := ""
+	fileType := ""
 	document.Find("audio.sf-audio > source").Each(func(i int, s *goquery.Selection) {
 		audioUrl, _ = s.Attr("src")
+		fileType = AudioFileType
 	})
-	return content, "", 0, audioUrl, audioUrl, "audio"
+	return content, "", 0, audioUrl, audioUrl, fileType
 }
diff --git a/templates/vimeo.go b/templates/vimeo.go
@@ -44,7 +44,7 @@ func (t *Template) VimeoExtractorMetaInfo(url string, document *goquery.Document
 			videoID := match[1]
 			embedUrl := "https://player.vimeo.com/video/" + videoID
 			contents := "<iframe width='896' height='504' src='" + embedUrl + "' frameborder='0' referrerpolicy='no-referrer'></iframe>"
-			return content, "", 0, contents, url, "video"
+			return content, "", 0, contents, url, VideoFileType
 		}
 
 	}
diff --git a/templates/xiaoyuzhoufm.go b/templates/xiaoyuzhoufm.go
@@ -46,10 +46,12 @@ func (t *Template) XiaoyuzhouFMExtractorMetaInfo(url string, document *goquery.D
 	content := xiaoyuzhouScrapContent(document)
 	author := xiaoyuzhouScrapAuthor(document)
 	audioUrl := ""
+	fileType := ""
 	document.Find("meta[property='og:audio']").Each(func(i int, s *goquery.Selection) {
 		if content, exists := s.Attr("content"); exists {
 			audioUrl = content
+			fileType = AudioFileType
 		}
 	})
-	return content, author, 0, audioUrl, audioUrl, "audio"
+	return content, author, 0, audioUrl, audioUrl, fileType
 }
diff --git a/templates/ximalaya.go b/templates/ximalaya.go
@@ -18,8 +18,10 @@ func ximalayaScrapContent(document *goquery.Document) string {
 func (t *Template) XimalayaExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {
 	content := ximalayaScrapContent(document)
 	author := ""
+	fileType := ""
 	document.Find("a.albumTitle").Each(func(i int, s *goquery.Selection) {
 		author = s.Text()
+		fileType = AudioFileType
 	})
-	return content, author, 0, url, url, "audio"
+	return content, author, 0, url, url, fileType
 }
diff --git a/templates/youtube.go b/templates/youtube.go
@@ -15,7 +15,7 @@ func (t *Template) YoutubeExtractorMetaInfo(url string, document *goquery.Docume
 			videoID := match[1]
 			embedUrl := "https://www.youtube.com/embed/gfx7mTmWdYU?si=" + videoID
 			contents := "<iframe width='840' height='472' src='" + embedUrl + "'  frameborder='0' allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share' referrerpolicy='strict-origin-when-cross-origin' allowfullscreen></iframe>"
-			return "", "", 0, contents, url, "video"
+			return "", "", 0, contents, url, VideoFileType
 		}
 
 	}
diff --git a/templates/zlibrary.go b/templates/zlibrary.go
@@ -49,7 +49,7 @@ func (t *Template) ZLibraryDownloadType(url string) (string, string, string) {
 		return "", "", ""
 	}
 	if matched {
-		return url, extractZLibraryIDWithRegex(url) + ".epub", "ebook"
+		return url, extractZLibraryIDWithRegex(url) + ".epub", EbookFileType
 	}
 	return "", "", ""
 

Original file line number	Diff line number	Diff line change
`@@ -36,5 +36,5 @@ func (t *Template) ACFunExtractorMetaInfo(url string, document *goquery.Document`
`36`	`36`	`author = s.Text()`
`37`	`37`	`})`
`38`	`38`	`publishedAt := acFunScrapPublishedAt(document)`
`39`		`- return content, author, publishedAt, "", url, "video"`
	`39`	`+ return content, author, publishedAt, "", url, VideoFileType`
`40`	`40`	`}`
Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,7 @@ import (`
`10`	`10`
`11`	`11`	`func (t *Template) LizhiExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {`
`12`	`12`	`audioUrl := ""`
	`13`	`+ fileType := ""`
`13`	`14`	`document.Find("script").Each(func(i int, s *goquery.Selection) {`
`14`	`15`	`scriptContent, err := s.Html()`
`15`	`16`	`if audioUrl == "" && err == nil && scriptContent != "" {`
`@@ -20,9 +21,10 @@ func (t *Template) LizhiExtractorMetaInfo(url string, document *goquery.Document`
`20`	`21`
`21`	`22`	`if len(matches) > 1 {`
`22`	`23`	`audioUrl = matches[1]`
	`24`	`+ fileType = AudioFileType`
`23`	`25`	`return`
`24`	`26`	`}`
`25`	`27`	`}`
`26`	`28`	`})`
`27`		`- return "", "", 0, audioUrl, audioUrl, "audio"`
	`29`	`+ return "", "", 0, audioUrl, audioUrl, fileType`
`28`	`30`	`}`
Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,7 @@ func podBeanScrapContent(document *goquery.Document) string {`
`23`	`23`
`24`	`24`	`func (t *Template) PodBeanExtractorMetaInfo(url string, document *goquery.Document) (string, string, int64, string, string, string) {`
`25`	`25`	`audioUrl := ""`
	`26`	`+ fileType := ""`
`26`	`27`	`content := podBeanScrapContent(document)`
`27`	`28`	`scriptSelector := "script[type=\"application/ld+json\"]"`
`28`	`29`	`document.Find(scriptSelector).Each(func(i int, s *goquery.Selection) {`
`@@ -38,9 +39,10 @@ func (t *Template) PodBeanExtractorMetaInfo(url string, document *goquery.Docume`
`38`	`39`	`associatedMediaDetail := associatedMediaData.(map[string]interface{})`
`39`	`40`	`if contentUrl, ok := associatedMediaDetail["contentUrl"]; ok {`
`40`	`41`	`audioUrl = contentUrl.(string)`
	`42`	`+ fileType = AudioFileType`
`41`	`43`	`}`
`42`	`44`	`}`
`43`	`45`	`}`
`44`	`46`	`})`
`45`		`- return content, "", 0, audioUrl, audioUrl, "audio"`
	`47`	`+ return content, "", 0, audioUrl, audioUrl, fileType`
`46`	`48`	`}`
Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ func (t *Template) RumbleExtractorMetaInfo(entryUrl string, document *goquery.Do`
`30`	`30`
`31`	`31`	`if embeddingUrl != "" {`
`32`	`32`	`contents := "<iframe width='960' height='540' src='" + embeddingUrl + "' frameborder='0' referrerpolicy='no-referrer'></iframe>"`
`33`		`- return "", "", 0, contents, entryUrl, "video"`
	`33`	`+ return "", "", 0, contents, entryUrl, VideoFileType`
`34`	`34`	`}`
`35`	`35`	`return "", "", 0, "", "", ""`
`36`	`36`	`}`