isdn-python/isdn/parser.py

18 lines
535 B
Python

import re
from typing import IO, Iterator
from lxml import etree
# XML namespace URIs keyed by a short prefix; "sitemap" is the sitemaps.org
# 0.9 schema namespace used to build fully-qualified element tags.
namespaces = {"sitemap": "http://www.sitemaps.org/schemas/sitemap/0.9"}
class ISDNJpSitemapXMLParser:
    """Extract ISDN codes from the ``<loc>`` entries of a sitemap XML document."""

    # Compiled once at class creation instead of on every <loc> element.
    # The dots are escaped so that only the literal host "isdn.jp" matches.
    _ISDN_URL_PATTERN = re.compile(r"https://isdn\.jp/(\d{13})")

    @staticmethod
    def parse_list(file: str | IO) -> Iterator[str]:
        """Yield the 13-digit code of every matching ``<loc>`` URL.

        The document is parsed incrementally with ``etree.iterparse``; only
        ``end`` events for sitemap-namespaced ``loc`` elements are inspected.

        Args:
            file: Source handed straight to ``etree.iterparse`` (a filename
                or a file-like object).

        Yields:
            The 13 digits captured from each ``<loc>`` whose text begins with
            ``https://isdn.jp/`` followed by at least 13 digits. Elements
            with no text, or whose text does not match, are skipped.
        """
        loc_tag = f"{{{namespaces['sitemap']}}}loc"
        for _event, elm in etree.iterparse(
            file, events=("end",), tag=[loc_tag], remove_blank_text=True
        ):
            text = elm.text
            # An empty <loc/> has text None; re.match would raise TypeError.
            if text is None:
                continue
            m = ISDNJpSitemapXMLParser._ISDN_URL_PATTERN.match(text)
            if m:
                yield m.group(1)