From d7fa51f7d5328e382247aee85c2fa400325175f5 Mon Sep 17 00:00:00 2001 From: Babibubebon Date: Mon, 3 Apr 2023 04:16:28 +0900 Subject: [PATCH] Add write-image option to bulk-download command --- isdn/client.py | 25 ++++++++++++++++++------- isdn/command.py | 29 ++++++++++++++++++++++++----- isdn/parser.py | 2 +- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/isdn/client.py b/isdn/client.py index fb77bca..b35fd22 100644 --- a/isdn/client.py +++ b/isdn/client.py @@ -5,13 +5,20 @@ import requests from . import ISDNRecord, __version__ from .parser import ISDNJpXMLParser -ISDN_API_ENDPOINT = "https://isdn.jp/xml/" +ISDN_XML_ENDPOINT = "https://isdn.jp/xml/{isdn}" +ISDN_IMAGE_ENDPOINT = "https://isdn.jp/images/thumbs/{isdn}.png" ISDN_SITEMAP = "https://isdn.jp/sitemap.xml" class ISDNClient: - def __init__(self, endpoint_url: str = ISDN_API_ENDPOINT, sitemap_url: str = ISDN_SITEMAP): - self.endpoint_url = endpoint_url + def __init__( + self, + xml_endpoint_url: str = ISDN_XML_ENDPOINT, + image_endpoint_url: str = ISDN_IMAGE_ENDPOINT, + sitemap_url: str = ISDN_SITEMAP, + ): + self.xml_endpoint_url = xml_endpoint_url + self.image_endpoint_url = image_endpoint_url self.sitemap_url = sitemap_url self.s = requests.Session() self.set_user_agent(f"isdn-python/{__version__}") @@ -23,17 +30,21 @@ class ISDNClient: def normalize_isdn(isdn: str) -> str: return isdn.replace("-", "").strip() - def _get(self, isdn: str) -> requests.Response: - r = self.s.get(self.endpoint_url + self.normalize_isdn(isdn)) + def _get(self, isdn: str, endpoint_url: str) -> requests.Response: + r = self.s.get(endpoint_url.format(isdn=self.normalize_isdn(isdn))) r.raise_for_status() return r def get(self, isdn: str) -> ISDNRecord: - r = self._get(isdn) + r = self._get(isdn, self.xml_endpoint_url) return ISDNJpXMLParser.parse_record(r.content) def get_raw(self, isdn: str) -> bytes: - r = self._get(isdn) + r = self._get(isdn, self.xml_endpoint_url) + return r.content + + def get_image(self, isdn: str) -> bytes: + r = self._get(isdn, self.image_endpoint_url) return r.content def _list(self) -> requests.Response: diff --git a/isdn/command.py b/isdn/command.py index 4c4c5c4..0c08b57 100644 --- a/isdn/command.py +++ b/isdn/command.py @@ -7,6 +7,7 @@ from requests.exceptions import HTTPError from . import ISDN, __version__ from .client import ISDNClient +from .parser import ISDNJpXMLParser @click.group() @@ -46,24 +47,42 @@ def list_isdns(): @cli.command(help="Download all xml record files from isdn.jp") @click.argument("directory", type=click.Path(exists=True, file_okay=False, writable=True)) @click.option("--force", "-f", is_flag=True, help="Overwrite existing files") -@click.option("--stop-on-error", is_flag=True, help="Stops when error occurs during download") +@click.option("--stop-on-error", is_flag=True, help="Stop when error occurs during download") @click.option("--sleep-time", "-s", type=int, default=500, help="Sleep interval for download (millisecond)") -def bulk_download(directory: str, force: bool, stop_on_error: bool, sleep_time: int): +@click.option("--write-image", is_flag=True, help="Write cover image to file") +@click.option( + "--write-image-path", + type=click.Path(exists=True, file_okay=False, writable=True), + help="Directory path to write cover images", +) +def bulk_download( + directory: str, force: bool, stop_on_error: bool, sleep_time: int, write_image: bool, write_image_path: str +): c = ISDNClient() with click.progressbar(list(c.list()), show_pos=True) as bar: for isdn in bar: path = os.path.join(directory, f"{isdn}.xml") - if not force and os.path.exists(path): + image_path = os.path.join(write_image_path or directory, f"{isdn}.png") + if not force and os.path.exists(path) and (not write_image or write_image and os.path.exists(image_path)): continue + try: res = c.get_raw(isdn) + with open(path, "wb") as out: + out.write(res) + + if write_image: + record = ISDNJpXMLParser.parse_record(res) + if record.sample_image_uri: + img = c.get_image(isdn) + with open(image_path, "wb") as out: + out.write(img) except HTTPError as err: if stop_on_error: raise err else: continue - with open(path, "wb") as out: - out.write(res) + time.sleep(sleep_time / 1000) diff --git a/isdn/parser.py b/isdn/parser.py index 1ab3cca..c7aba07 100644 --- a/isdn/parser.py +++ b/isdn/parser.py @@ -53,7 +53,7 @@ class ISDNJpXMLParser: for event, elm in etree.iterparse( file, events=("end",), tag=[f"{{{namespaces['sitemap']}}}loc"], remove_blank_text=True ): - m = re.match("https://isdn.jp/(\d{13})", elm.text) + m = re.match(r"https://isdn.jp/(\d{13})", elm.text) if not m: continue yield m.group(1)