isdn-python/isdn/parser.py

19 lines
535 B
Python
Raw Permalink Normal View History

2023-03-26 04:57:18 +09:00
import re
from typing import IO, Iterator
from lxml import etree
2023-04-08 03:32:34 +09:00
# XML namespace URIs keyed by the short prefix used when building
# Clark-notation ("{uri}tag") element names for the sitemap parser.
namespaces = {
    "sitemap": "http://www.sitemaps.org/schemas/sitemap/0.9",
}
2023-03-26 04:57:18 +09:00
2023-04-08 03:32:34 +09:00
class ISDNJpSitemapXMLParser:
    """Extract ISDN codes from the ``<loc>`` entries of a sitemap XML document."""

    # Compiled once instead of being looked up on every <loc> element.
    # The dot is escaped so only the literal host "isdn.jp" is accepted.
    _ISDN_URL_PATTERN = re.compile(r"https://isdn\.jp/(\d{13})")

    @staticmethod
    def parse_list(file: str | IO) -> Iterator[str]:
        """Yield the 13-digit code from every matching ``<loc>`` element.

        The document is parsed incrementally with ``etree.iterparse``, so
        codes are produced as the parser reaches each ``<loc>`` end tag.

        Args:
            file: Path to, or open file object of, the sitemap XML document.

        Yields:
            The 13 digits captured from each ``<loc>`` whose text starts with
            ``https://isdn.jp/`` followed by 13 digits. Entries that are
            empty or do not match are skipped.
        """
        loc_tag = f"{{{namespaces['sitemap']}}}loc"
        pattern = ISDNJpSitemapXMLParser._ISDN_URL_PATTERN

        for _event, elm in etree.iterparse(
            file, events=("end",), tag=[loc_tag], remove_blank_text=True
        ):
            text = elm.text
            # The text has been read; release the element's content so it is
            # not kept alive for the remainder of the parse.
            elm.clear()

            # An empty <loc/> has no text; skip it rather than letting the
            # regex call raise TypeError on None.
            if text is None:
                continue

            # Tolerate whitespace/newlines surrounding the URL inside <loc>.
            m = pattern.match(text.strip())
            if m:
                yield m.group(1)