Add douban metadate provider

2022-02-25 01:01:12 +08:00 · 2022-02-25 01:01:12 +08:00 · 86b779f39b
commit 86b779f39b
parent 8007e450b3
1 changed files with 175 additions and 0 deletions
--- a/cps/metadata_provider/douban.py
+++ b/cps/metadata_provider/douban.py
@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+
+#  This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
+#    Copyright (C) 2022 xlivevil
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program. If not, see <http://www.gnu.org/licenses/>.
+import re
+from concurrent import futures
+from typing import List, Optional
+
+import requests
+from html2text import HTML2Text
+from lxml import etree
+
+from cps import logger
+from cps.services.Metadata import Metadata, MetaRecord, MetaSourceInfo
+
+log = logger.create()
+
+
+def html2text(html: str) -> str:
+
+    h2t = HTML2Text()
+    h2t.body_width = 0
+    h2t.single_line_break = True
+    h2t.emphasis_mark = "*"
+    return h2t.handle(html)
+
+
+class Douban(Metadata):
+    __name__ = "豆瓣"
+    __id__ = "douban"
+    DESCRIPTION = "豆瓣"
+    META_URL = "https://book.douban.com/"
+    SEARCH_URL = "https://www.douban.com/j/search"
+
+    ID_PATTERN = re.compile(r"sid: (?P<id>\d+),")
+    AUTHORS_PATTERN = re.compile(r"作者|译者")
+    PUBLISHER_PATTERN = re.compile(r"出版社")
+    SUBTITLE_PATTERN = re.compile(r"副标题")
+    PUBLISHED_DATE_PATTERN = re.compile(r"出版年")
+    SERIES_PATTERN = re.compile(r"丛书")
+    IDENTIFIERS_PATTERN = re.compile(r"ISBN|统一书号")
+
+    TITTLE_XPATH = "//span[@property='v:itemreviewed']"
+    COVER_XPATH = "//a[@class='nbg']"
+    INFO_XPATH = "//*[@id='info']//span[@class='pl']"
+    TAGS_XPATH = "//a[contains(@class, 'tag')]"
+    DESCRIPTION_XPATH = "//div[@id='link-report']//div[@class='intro']"
+    RATING_XPATH = "//div[@class='rating_self clearfix']/strong"
+
+    session = requests.Session()
+    session.headers = {
+        'user-agent':
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56',
+    }
+
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ) -> Optional[List[MetaRecord]]:
+        if self.active:
+            log.debug(f"starting search {query} on douban")
+            if title_tokens := list(
+                self.get_title_tokens(query, strip_joiners=False)
+            ):
+                query = "+".join(title_tokens)
+
+            try:
+                r = self.session.get(
+                    self.SEARCH_URL, params={"cat": 1001, "q": query}
+                )
+                r.raise_for_status()
+
+            except Exception as e:
+                log.warning(e)
+                return None
+
+            results = r.json()
+            if results["total"] == 0:
+                return val
+
+            book_id_list = [
+                self.ID_PATTERN.search(item).group("id")
+                for item in results["items"][:10] if self.ID_PATTERN.search(item)
+            ]
+
+            with futures.ThreadPoolExecutor(max_workers=5) as executor:
+
+                fut = [
+                    executor.submit(self._parse_single_book, book_id, generic_cover)
+                    for book_id in book_id_list
+                ]
+                
+                val = [
+                    future.result() 
+                    for future in futures.as_completed(fut) if future.result()
+                ]
+
+        return val
+
+    def _parse_single_book(
+        self, id: str, generic_cover: str = ""
+    ) -> Optional[MetaRecord]:
+        url = f"https://book.douban.com/subject/{id}/"
+
+        try:
+            r = self.session.get(url)
+            r.raise_for_status()
+        except Exception as e:
+            log.warning(e)
+            return None
+
+        match = MetaRecord(
+            id=id,
+            title="",
+            authors=[],
+            url=url,
+            source=MetaSourceInfo(
+                id=self.__id__,
+                description=self.DESCRIPTION,
+                link=self.META_URL,
+            ),
+        )
+
+        html = etree.HTML(r.content.decode("utf8"))
+
+        match.title = html.xpath(self.TITTLE_XPATH)[0].text
+        match.cover = html.xpath(self.COVER_XPATH)[0].attrib["href"] or generic_cover
+        try:
+            rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
+        except ValueError:
+            rating_num = 0
+        match.rating = int(-1 * rating_num // 2 * -1) if rating_num else 0
+
+        tag_elements = html.xpath(self.TAGS_XPATH)
+        if len(tag_elements):
+            match.tags = [tag_element.text for tag_element in tag_elements]
+
+        description_element = html.xpath(self.DESCRIPTION_XPATH)
+        if len(description_element):
+            match.description = html2text(etree.tostring(
+                description_element[-1], encoding="utf8").decode("utf8"))
+
+        info = html.xpath(self.INFO_XPATH)
+
+        for element in info:
+            text = element.text
+            if self.AUTHORS_PATTERN.search(text):
+                next = element.getnext()
+                while next is not None and next.tag != "br":
+                    match.authors.append(next.text)
+                    next = next.getnext()
+            elif self.PUBLISHER_PATTERN.search(text):
+                match.publisher = element.tail.strip()
+            elif self.SUBTITLE_PATTERN.search(text):
+                match.title = f'{match.title}:' + element.tail.strip()
+            elif self.PUBLISHED_DATE_PATTERN.search(text):
+                match.publishedDate = element.tail.strip()
+            elif self.SUBTITLE_PATTERN.search(text):
+                match.series = element.getnext().text
+            elif i_type := self.IDENTIFIERS_PATTERN.search(text):
+                match.identifiers[i_type.group()] = element.tail.strip()
+
+        return match