2023-03-26 04:57:18 +09:00
|
|
|
import re
|
|
|
|
from typing import IO, Iterator
|
|
|
|
|
|
|
|
from lxml import etree
|
|
|
|
|
2023-04-08 03:32:34 +09:00
|
|
|
# Prefix -> namespace-URI map used to build qualified tag names for sitemap XML.
namespaces = dict(sitemap="http://www.sitemaps.org/schemas/sitemap/0.9")
|
2023-03-26 04:57:18 +09:00
|
|
|
|
|
|
|
|
2023-04-08 03:32:34 +09:00
|
|
|
class ISDNJpSitemapXMLParser:
    """Extract 13-digit identifiers from the ``<loc>`` entries of a sitemap XML."""

    # Compiled once instead of on every <loc>; the dot is escaped so that only
    # the literal host "isdn.jp" is accepted (an unescaped "." matches any char).
    _LOC_URL_PATTERN = re.compile(r"https://isdn\.jp/(\d{13})")

    @staticmethod
    def parse_list(file: str | IO) -> Iterator[str]:
        """Yield the 13-digit code of every matching ``<loc>`` URL in *file*.

        The document is streamed with ``etree.iterparse``, visiting only
        ``loc`` elements in the sitemap namespace. Entries whose text is
        missing or does not start with ``https://isdn.jp/<13 digits>`` are
        skipped.

        Args:
            file: Path or file object handed straight to ``etree.iterparse``.

        Yields:
            The 13-digit string captured from each matching URL, in document
            order.
        """
        loc_tag = f"{{{namespaces['sitemap']}}}loc"
        pattern = ISDNJpSitemapXMLParser._LOC_URL_PATTERN

        for _event, elm in etree.iterparse(
            file, events=("end",), tag=[loc_tag], remove_blank_text=True
        ):
            text = elm.text
            # The text has been read; drop the element's content so a large
            # sitemap does not keep every parsed <loc> payload alive.
            elm.clear()

            # An empty <loc/> has text None, which re.match would reject with
            # a TypeError; treat it like any other non-matching entry.
            if text is None:
                continue

            # URLs may be surrounded by whitespace/newlines inside <loc>.
            m = pattern.match(text.strip())
            if not m:
                continue
            yield m.group(1)
|