calibre-web: dtrpg-metadata Add DMs Guild and fix cover downloads

DriveThruRPG contains multiple stores, one of which is the DMs Guild
for Dungeons and Dragons. Items that are only for sale on the Guild
don't show up on DTRPG searches, and vice versa.

A lot of the images end up being redirected to an eventual file, but
Calibre-web doesn't follow redirects when saving the cover. Instead, we
follow them up front and return the eventual URL.
This commit is contained in:
Ross Hendry
2022-11-12 16:07:56 +00:00
parent 8a3b2c29c5
commit dd67799eb1
+173 -110
View File
@@ -13,7 +13,7 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from typing import Dict, List, Optional
from typing import Dict, List, Optional, cast
from urllib.parse import quote
from lxml import html
import requests
@@ -24,16 +24,132 @@ from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
log = logger.create()
HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"}
class DMSGuild(Metadata):
    """Metadata provider that searches the DM's Guild store (dmsguild.com).

    The search term is sent to the store's autocomplete endpoint and the
    response is handed to ``_do_dtrpg_search`` to be turned into records.
    """

    __name__ = "DMSGuild"
    __id__ = "dmsguild"
    DESCRIPTION = "DM's Guild"
    META_URL = "https://www.dmsguild.com"
    # Autocomplete endpoint; the encoded search term is appended directly.
    BASE_URL = f"{META_URL}/includes/ajax/search_autocomplete_jquery.php?term="
    # Appended after the search term when the request URL is assembled.
    QUERY_PARAMS = "&json=true"
    HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"}

    def search(self, query: str, generic_cover: str = "", locale: str = "en"):
        """Search the store for ``query`` and return the matching records.

        Args:
            query: Free-text search string, typically a title.
            generic_cover: Accepted for interface compatibility; not used here.
            locale: Accepted for interface compatibility; not used here.

        Returns:
            ``None`` when this provider is not active, otherwise whatever
            ``_do_dtrpg_search`` returns for the assembled search URL.
        """
        if not self.active:
            return None

        words = list(self.get_title_tokens(query, strip_joiners=False))
        if words:
            # Percent-encode each token and join them with an encoded space.
            query = "%20".join([quote(word.encode("utf-8")) for word in words])

        search_url = f"{self.BASE_URL}{query}{self.QUERY_PARAMS}"
        provider = MetaSourceInfo(
            id=self.__id__,
            description=self.DESCRIPTION,
            link=self.META_URL,
        )
        return _do_dtrpg_search(query=search_url, source=provider)
class DriveThruRpg(Metadata):
    """Metadata provider that searches the DriveThruRPG store.

    The search term is sent to the store's autocomplete endpoint and the
    response is handed to ``_do_dtrpg_search`` to be turned into records.
    """

    __name__ = "DriveThruRPG"
    __id__ = "drivethrurpg"
    DESCRIPTION = "DriveThru RPG"
    # Single definition of the store root (no trailing slash) from which the
    # endpoint URL is derived; the earlier duplicate assignments were dead
    # because they were immediately overwritten.
    META_URL = "https://www.drivethrurpg.com"
    # Autocomplete endpoint; the encoded search term is appended directly.
    BASE_URL = f"{META_URL}/includes/ajax/search_autocomplete_jquery.php?term="
    # Appended after the search term when the request URL is assembled.
    QUERY_PARAMS = "&json=true"
    HEADERS = {"User-Agent": "Not Evil Browser", "accept-encoding": "gzip"}

    def search(
        self, query: str, generic_cover: str = "", locale: str = "en"
    ) -> Optional[List[MetaRecord]]:
        """Search the store for ``query`` and return the matching records.

        Args:
            query: Free-text search string, typically a title.
            generic_cover: Accepted for interface compatibility; not used here.
            locale: Accepted for interface compatibility; not used here.

        Returns:
            ``None`` when this provider is not active, otherwise the list
            produced by ``_do_dtrpg_search`` for the assembled search URL.
        """
        if not self.active:
            return None

        title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
        if title_tokens:
            # Percent-encode each token and join them with an encoded space.
            query = "%20".join(quote(t.encode("utf-8")) for t in title_tokens)

        return _do_dtrpg_search(
            query=f"{self.BASE_URL}{query}{self.QUERY_PARAMS}",
            source=MetaSourceInfo(
                id=self.__id__,
                description=self.DESCRIPTION,
                link=self.META_URL,
            ),
        )
def _do_dtrpg_search(query: str, source: MetaSourceInfo) -> List[MetaRecord]:
    """Run an autocomplete search and expand the first hits into records.

    Args:
        query: Fully assembled search URL to request.
        source: Provider descriptor attached to every record; its ``id`` is
            also used as the key of each record's ``identifiers`` mapping.

    Returns:
        At most five records. An empty list is returned when the request
        fails, the body is not a JSON list, or the store reports no hits.
    """
    try:
        log.info(f"Requesting data from: {query}")
        response = requests.get(
            query,
            headers=HEADERS,
            timeout=30,  # seconds; requests never times out unless told to
        )
        response.raise_for_status()
        # Decoding happens inside the try so a non-JSON body is handled the
        # same way as a failed request instead of escaping to the caller.
        results_list = response.json()
    except Exception as e:
        # Best-effort boundary: any failure just means "no results".
        log.warning(e)
        return []

    if not isinstance(results_list, list):
        log.warning("Unexpected search response; expected a JSON list")
        return []

    # If there are no hits we see a single element being returned with the
    # easiest identifier being the link.
    if len(results_list) == 1:
        only_entry = results_list[0]
        if isinstance(only_entry, dict) and only_entry.get("link") == "#":
            log.info("No results found")
            return []

    # Since we'll go on to do N further requests for more information,
    # we'll cut it off at the first five results here. Any sufficiently well
    # populated search by title should be enough
    results: List[MetaRecord] = []
    for entry in results_list[0:5]:
        if not isinstance(entry, dict):
            # Explicit check instead of ``assert``, which is stripped by -O.
            log.warning("Skipping malformed search result entry")
            continue
        match = _fetch_dtrpg_search_result(result=entry, source=source)
        match.identifiers = {source.id: match.id}
        results.append(match)
    return results
def _fetch_dtrpg_search_result(result: Dict, source: MetaSourceInfo) -> MetaRecord:
    """Build a record for one search hit and enrich it from its details page.

    Args:
        result: One entry of the search response; ``name`` is required and
            ``link`` (the details page URL) is optional.
        source: Provider descriptor stored on the record.

    Returns:
        The record. It carries only the name and link when the details page
        cannot be fetched or parsed.

    Raises:
        KeyError: If ``result`` has no ``name`` entry.
    """
    link = result.get("link", "")
    match = MetaRecord(
        id=result["name"],
        title=result["name"],
        authors=[],
        url=link,
        source=source,
    )

    if not link:
        # Nothing to fetch; previously this case was only handled by a
        # KeyError being swallowed by the broad except below.
        log.warning(f"No details link for search result: {match.title}")
        return match

    try:
        details_result = requests.get(
            link,
            headers=HEADERS,
            timeout=30,  # seconds; requests never times out unless told to
        )
        details_result.raise_for_status()
    except Exception as e:
        # Best-effort: the link comes from the remote response, so any
        # failure here simply leaves the record un-enriched.
        log.warning(e)
        return match

    try:
        _parse_dtrpg_result(details_result.content, match)
    except Exception as e:
        # One unparseable details page should not abort the whole search;
        # keep whatever fields were populated before the failure.
        log.warning(e)
    return match
def _parse_dtrpg_result(content: bytes, match: MetaRecord):
AUTHORS_XPATH = "//div[@class='widget-information-wrapper']//div[@class='widget-information-item-title' and contains(text(), 'Author(s)')]"
RULE_SYSTEMS_XPATH = "//div[@class='widget-information-wrapper']//div[@class='widget-information-item-title' and contains(text(), 'Rule System(s)')]"
PUBLISHER_XPATH = "//div[@class='widget-information-wrapper-2']//div[@class='widget-information-title' and contains(text(), 'Publisher')]"
@@ -41,118 +157,65 @@ class DriveThruRpg(Metadata):
DESCRIPTION_XPATH = "//div[contains(@class,'prod-content')]//text()"
IMAGE_PROP_XPATH = "//meta[@itemprop='image']/@content"
def search(
self, query: str, generic_cover: str = "", locale: str = "en"
) -> Optional[List[MetaRecord]]:
val = list()
if self.active:
title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
if title_tokens:
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
query = "%20".join(tokens)
data = html.fromstring(content)
try:
result = requests.get(
f"{DriveThruRpg.BASE_URL}{query}{DriveThruRpg.QUERY_PARAMS}",
headers=DriveThruRpg.HEADERS,
)
result.raise_for_status()
except Exception as e:
log.warning(e)
return None
# Use the big text field as description as the meta tag is very short
description_field = data.xpath(DESCRIPTION_XPATH)
assert isinstance(description_field, List)
if description_field is not None:
match.description = "".join(description_field).strip() # type: ignore
# Since we'll go on to do N further requests for more information,
# we'll cut it off at the first five results here. Any sufficiently well
# populated search by title should be enough
for r in result.json()[0:5]:
assert isinstance(r, dict)
match = self._parse_search_result(
result=r, generic_cover=generic_cover, locale=locale
)
val.append(match)
return val
product_url = data.xpath(URL_PROP_XPATH)
assert isinstance(product_url, List)
if product_url is not None and len(product_url) > 0:
match.url = cast(str, product_url[0])
def _parse_search_result(
self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord:
match = MetaRecord(
id=result["name"],
title=result["name"],
authors=[],
url=result.get("link", ""),
source=MetaSourceInfo(
id=self.__id__,
description=DriveThruRpg.DESCRIPTION,
link=DriveThruRpg.META_URL,
),
# We can get a better ID from the URL
regex = r".*\/product\/(\d+)\/.*"
matches = re.findall(regex, match.url)
if len(matches) > 0:
match.id = matches[0]
image_url = data.xpath(IMAGE_PROP_XPATH)
assert isinstance(image_url, List)
if image_url is not None and len(image_url) > 0:
# Calibre web doesn't follow redirects and reports some covers as an error
log.info(f"Cover URL is {image_url[0]}")
r = requests.head(image_url[0], allow_redirects=True)
log.info(f"After following redirects, it is {r.url}")
match.cover = cast(str, r.url)
# Find authors
for div in cast(List, data.xpath(AUTHORS_XPATH)):
# Just bring in elements that look like they might be authors.
authors = list(
filter(
lambda x: re.match(r"^\w[\w\s]+$", x),
div.getnext().xpath(".//text()"),
)
)
match.authors = authors
try:
details_result = requests.get(
result["link"],
headers=DriveThruRpg.HEADERS,
# Use rule systems as tags
match.tags = ["RPG"]
for div in cast(list, data.xpath(RULE_SYSTEMS_XPATH)):
rule_systems = list(
filter(
lambda x: len(x.strip()) > 0,
div.getnext().xpath(".//text()"),
)
details_result.raise_for_status()
except Exception as e:
log.warning(e)
return match
)
match.tags.extend(rule_systems)
data = html.fromstring(details_result.content)
# Use the big text field as description as the meta tag is very short
description_field = data.xpath(self.DESCRIPTION_XPATH)
if description_field is not None:
match.description = "".join(description_field).strip()
product_url = data.xpath(self.URL_PROP_XPATH)
if product_url is not None and len(product_url) > 0:
match.url = product_url[0]
# We can get a better ID from the URL
regex = r".*\/product\/(\d+)\/.*"
matches = re.findall(regex, match.url)
if len(matches) > 0:
match.id = matches[0]
image_url = data.xpath(self.IMAGE_PROP_XPATH)
if image_url is not None and len(image_url) > 0:
match.cover = image_url[0]
# Find authors
for div in data.xpath(self.AUTHORS_XPATH):
# Just bring in elements that look like they might be authors.
authors = list(
filter(
lambda x: re.match(r"^\w[\w\s]+$", x),
div.getnext().xpath(".//text()"),
)
for div in cast(List, data.xpath(PUBLISHER_XPATH)):
publisher_link = div.getnext().xpath(".//a")
# Sometimes we get a link, other times it's text in a different element.
if publisher_link is not None and len(publisher_link) > 0:
match.publisher = publisher_link[0].text_content().strip()
else:
publisher_name = div.getnext().xpath(
".//div[@class='widget-information-item-title']"
)
match.authors = authors
match.publisher = publisher_name[0].text_content().strip()
# Use rule systems as tags
match.tags = ["RPG"]
for div in data.xpath(self.RULE_SYSTEMS_XPATH):
rule_systems = list(
filter(
# lambda x: re.match(r"^\w[()\w\s]+$", x),
lambda x: len(x.strip()) > 0,
div.getnext().xpath(".//text()"),
)
)
match.tags.extend(rule_systems)
for div in data.xpath(self.PUBLISHER_XPATH):
publisher_link = div.getnext().xpath(".//a")
# Sometimes we get a link, other times it's text in a different element.
if publisher_link is not None and len(publisher_link) > 0:
match.publisher = publisher_link[0].text_content().strip()
else:
publisher_name = div.getnext().xpath(
".//div[@class='widget-information-item-title']"
)
match.publisher = publisher_name[0].text_content().strip()
# match.publishedDate = result.get("store_date", result.get("date_added"))
match.identifiers = {"drivethrurpg": match.id}
return match
return match