mirror of
https://github.com/Babibubebon/gbizinfo-lod.git
synced 2024-09-22 22:54:21 +09:00
Support for output in nquads
This commit is contained in:
parent
bb2998f781
commit
fefb76e117
10 changed files with 93 additions and 29 deletions
|
@ -9,6 +9,7 @@ import click
|
||||||
from . import __version__
|
from . import __version__
|
||||||
from .client import GbizinfoClient
|
from .client import GbizinfoClient
|
||||||
from .mappers import *
|
from .mappers import *
|
||||||
|
from .mappers import RDFFormatType
|
||||||
|
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
|
@ -64,7 +65,7 @@ def download(work_dir: str, sleep: int):
|
||||||
click.echo("Retrieving Kihonjoho (IMI version)")
|
click.echo("Retrieving Kihonjoho (IMI version)")
|
||||||
with open(kihonjoho_imi_file, "w", encoding="utf-8") as f:
|
with open(kihonjoho_imi_file, "w", encoding="utf-8") as f:
|
||||||
writer = None
|
writer = None
|
||||||
for row in get_kihonjoho_imi(kihonjoho_csv_file, client):
|
for row in get_kihonjoho_imi(kihonjoho_csv_file, client, sleep):
|
||||||
if writer is None:
|
if writer is None:
|
||||||
writer = csv.DictWriter(f, fieldnames=row.keys())
|
writer = csv.DictWriter(f, fieldnames=row.keys())
|
||||||
writer.writeheader()
|
writer.writeheader()
|
||||||
|
@ -123,7 +124,15 @@ MAPPER_TYPES = [
|
||||||
@click.option(
|
@click.option(
|
||||||
"--output-dir", "-o", type=click.Path(exists=True, file_okay=False, writable=True)
|
"--output-dir", "-o", type=click.Path(exists=True, file_okay=False, writable=True)
|
||||||
)
|
)
|
||||||
def convert(work_dir: str, mappers: list[str], processes: int, output_dir: str):
|
@click.option(
|
||||||
|
"--format",
|
||||||
|
"-f",
|
||||||
|
type=click.Choice([v.name for v in RDFFormatType]),
|
||||||
|
default=RDFFormatType.nq.name,
|
||||||
|
)
|
||||||
|
def convert(
|
||||||
|
work_dir: str, mappers: list[str], processes: int, output_dir: str, format: str
|
||||||
|
):
|
||||||
if not mappers:
|
if not mappers:
|
||||||
mappers = MAPPER_TYPES
|
mappers = MAPPER_TYPES
|
||||||
if not output_dir:
|
if not output_dir:
|
||||||
|
@ -166,11 +175,11 @@ def convert(work_dir: str, mappers: list[str], processes: int, output_dir: str):
|
||||||
case _:
|
case _:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
output_file = os.path.join(output_dir, f"{m}.nt")
|
output_file = os.path.join(output_dir, f"{m}.{format}")
|
||||||
click.echo(f"output: {output_file}")
|
click.echo(f"output: {output_file}")
|
||||||
click.echo(f"Running {m} mapper ...")
|
click.echo(f"Running {m} mapper ...")
|
||||||
with open(output_file, "w") as f:
|
with open(output_file, "w") as f:
|
||||||
mapper.run(n_jobs=processes, output=f)
|
mapper.run(n_jobs=processes, output=f, format=RDFFormatType[format])
|
||||||
|
|
||||||
|
|
||||||
@cli.command(help="Fetch CSV data from OutputCSV endpoint")
|
@cli.command(help="Fetch CSV data from OutputCSV endpoint")
|
||||||
|
|
|
@ -2,13 +2,15 @@ import csv
|
||||||
import sys
|
import sys
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import IO, Iterator, Tuple, Union
|
from typing import IO, Iterator, Tuple, Union
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
from rdflib import BNode
|
from rdflib import BNode, URIRef
|
||||||
from rdflib import Literal as LiteralRdflib
|
from rdflib import Literal as LiteralRdflib
|
||||||
from rdflib.graph import _ObjectType, _PredicateType, _SubjectType, _TripleType
|
from rdflib.graph import _ObjectType, _PredicateType, _SubjectType, _TripleType
|
||||||
from rdflib.namespace import RDF
|
from rdflib.namespace import RDF
|
||||||
from rdflib.plugins.serializers.nt import _nt_row
|
from rdflib.plugins.serializers.nt import _nt_row
|
||||||
|
from rdflib.plugins.serializers.nquads import _nq_row
|
||||||
|
|
||||||
_TripleMapType = Tuple[
|
_TripleMapType = Tuple[
|
||||||
_SubjectType, _PredicateType, Union[str, _ObjectType, "BlankPredicateObjectMap"]
|
_SubjectType, _PredicateType, Union[str, _ObjectType, "BlankPredicateObjectMap"]
|
||||||
|
@ -16,6 +18,7 @@ _TripleMapType = Tuple[
|
||||||
_PredicateObjectType = Tuple[
|
_PredicateObjectType = Tuple[
|
||||||
_PredicateType, Union[str, _ObjectType, "BlankPredicateObjectMap"]
|
_PredicateType, Union[str, _ObjectType, "BlankPredicateObjectMap"]
|
||||||
]
|
]
|
||||||
|
RDFFormatType = Enum("RDFFormatType", ["nt", "nq"])
|
||||||
|
|
||||||
|
|
||||||
class Literal:
|
class Literal:
|
||||||
|
@ -66,8 +69,8 @@ def flatten_triple_map(triple_map: _TripleMapType) -> list[_TripleType]:
|
||||||
return triples
|
return triples
|
||||||
|
|
||||||
|
|
||||||
def serialize_triple(triple: _TripleMapType) -> str:
|
def serialize_triple(triple: _TripleMapType, graph: URIRef | None = None) -> str:
|
||||||
return _nt_row(triple)
|
return _nq_row(triple, graph) if graph else _nt_row(triple)
|
||||||
|
|
||||||
|
|
||||||
class CSV2RDFMapper(ABC):
|
class CSV2RDFMapper(ABC):
|
||||||
|
@ -80,33 +83,39 @@ class CSV2RDFMapper(ABC):
|
||||||
for row in reader:
|
for row in reader:
|
||||||
yield row
|
yield row
|
||||||
|
|
||||||
def run(self, n_jobs: int = -1, output: IO[str] = sys.stdout):
|
def run(
|
||||||
if n_jobs == 1:
|
self,
|
||||||
for row in self.iterator():
|
n_jobs: int = -1,
|
||||||
for triple_map in self.map_to_triples(self.preprocess(row)):
|
output: IO[str] = sys.stdout,
|
||||||
for triple in flatten_triple_map(triple_map):
|
format: RDFFormatType = RDFFormatType.nq,
|
||||||
output.write(serialize_triple(triple))
|
):
|
||||||
else:
|
def job(row: dict[str, str]) -> str:
|
||||||
|
return "".join(
|
||||||
def job(row: dict[str, str]) -> str:
|
[
|
||||||
return "".join(
|
serialize_triple(
|
||||||
[
|
triple,
|
||||||
serialize_triple(triple)
|
graph=self.graph if format == RDFFormatType.nq else None,
|
||||||
for triple_map in self.map_to_triples(self.preprocess(row))
|
)
|
||||||
for triple in flatten_triple_map(triple_map)
|
for triple_map in self.map_to_triples(self.preprocess(row))
|
||||||
]
|
for triple in flatten_triple_map(triple_map)
|
||||||
)
|
]
|
||||||
|
|
||||||
res = Parallel(n_jobs=n_jobs, return_as="generator_unordered", verbose=1)(
|
|
||||||
delayed(job)(row) for row in self.iterator()
|
|
||||||
)
|
)
|
||||||
for lines in res:
|
|
||||||
output.write(lines)
|
res = Parallel(n_jobs=n_jobs, return_as="generator_unordered", verbose=1)(
|
||||||
|
delayed(job)(row) for row in self.iterator()
|
||||||
|
)
|
||||||
|
for lines in res:
|
||||||
|
output.write(lines)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def preprocess(row: dict[str, str]) -> dict[str, str | None]:
|
def preprocess(row: dict[str, str]) -> dict[str, str | None]:
|
||||||
return {key: val if val != "" else None for key, val in row.items()}
|
return {key: val if val != "" else None for key, val in row.items()}
|
||||||
|
|
||||||
|
@property
|
||||||
|
@abstractmethod
|
||||||
|
def graph(self) -> URIRef:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
||||||
|
@ -123,6 +132,7 @@ from .tokkyo import GbizInfoTokkyoMapper
|
||||||
from .zaimu import GbizInfoZaimuMapper
|
from .zaimu import GbizInfoZaimuMapper
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
"RDFFormatType",
|
||||||
"GbizInfoHojinMapper",
|
"GbizInfoHojinMapper",
|
||||||
"GbizInfoHojyokinMapper",
|
"GbizInfoHojyokinMapper",
|
||||||
"GbizInfoChotatsuMapper",
|
"GbizInfoChotatsuMapper",
|
||||||
|
|
|
@ -1,11 +1,16 @@
|
||||||
|
from rdflib import URIRef
|
||||||
from ..namespace import *
|
from ..namespace import *
|
||||||
from . import _TripleMapType, bpo
|
from . import _TripleMapType
|
||||||
from ._katsudo import GbizInfoKatsudoMapper
|
from ._katsudo import GbizInfoKatsudoMapper
|
||||||
|
|
||||||
|
|
||||||
class GbizInfoChotatsuMapper(GbizInfoKatsudoMapper):
|
class GbizInfoChotatsuMapper(GbizInfoKatsudoMapper):
|
||||||
"""調達情報"""
|
"""調達情報"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def graph(self) -> URIRef:
|
||||||
|
return URIRef("http://hojin-info.go.jp/graph/chotatsu")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
||||||
ss = HJ_EXT[f"{row['ID-識別値']}_{row['キー情報']}"]
|
ss = HJ_EXT[f"{row['ID-識別値']}_{row['キー情報']}"]
|
||||||
|
|
|
@ -7,6 +7,10 @@ from . import CSV2RDFMapper, Literal, _TripleMapType, bpo
|
||||||
class GbizInfoHojinMapper(CSV2RDFMapper):
|
class GbizInfoHojinMapper(CSV2RDFMapper):
|
||||||
"""法人情報・法人基本情報"""
|
"""法人情報・法人基本情報"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def graph(self) -> URIRef:
|
||||||
|
return URIRef("http://hojin-info.go.jp/graph/hojin")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
||||||
s = HJ_DATA[row["ID-識別値"]]
|
s = HJ_DATA[row["ID-識別値"]]
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from rdflib import URIRef
|
||||||
|
|
||||||
from ..namespace import *
|
from ..namespace import *
|
||||||
from . import Literal, _TripleMapType, bpo
|
from . import Literal, _TripleMapType, bpo
|
||||||
from ._katsudo import GbizInfoKatsudoMapper
|
from ._katsudo import GbizInfoKatsudoMapper
|
||||||
|
@ -6,6 +8,10 @@ from ._katsudo import GbizInfoKatsudoMapper
|
||||||
class GbizInfoHojyokinMapper(GbizInfoKatsudoMapper):
|
class GbizInfoHojyokinMapper(GbizInfoKatsudoMapper):
|
||||||
"""補助金情報"""
|
"""補助金情報"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def graph(self) -> URIRef:
|
||||||
|
return URIRef("http://hojin-info.go.jp/graph/hojyokin")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
||||||
ss = HJ_EXT[f"{row['ID-識別値']}_{row['キー情報']}"]
|
ss = HJ_EXT[f"{row['ID-識別値']}_{row['キー情報']}"]
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from rdflib import URIRef
|
||||||
|
|
||||||
from ..namespace import *
|
from ..namespace import *
|
||||||
from . import _TripleMapType
|
from . import _TripleMapType
|
||||||
from ._katsudo import GbizInfoKatsudoMapper
|
from ._katsudo import GbizInfoKatsudoMapper
|
||||||
|
@ -6,6 +8,10 @@ from ._katsudo import GbizInfoKatsudoMapper
|
||||||
class GbizInfoHyoshoMapper(GbizInfoKatsudoMapper):
|
class GbizInfoHyoshoMapper(GbizInfoKatsudoMapper):
|
||||||
"""表彰情報"""
|
"""表彰情報"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def graph(self) -> URIRef:
|
||||||
|
return URIRef("http://hojin-info.go.jp/graph/hyosho")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
||||||
ss = HJ_EXT[f"{row['ID-識別値']}_{row['キー情報']}"]
|
ss = HJ_EXT[f"{row['ID-識別値']}_{row['キー情報']}"]
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from rdflib import URIRef
|
||||||
|
|
||||||
from ..namespace import *
|
from ..namespace import *
|
||||||
from . import Literal, _TripleMapType, bpo
|
from . import Literal, _TripleMapType, bpo
|
||||||
from ._katsudo import GbizInfoKatsudoMapper
|
from ._katsudo import GbizInfoKatsudoMapper
|
||||||
|
@ -6,6 +8,10 @@ from ._katsudo import GbizInfoKatsudoMapper
|
||||||
class GbizInfoShokubaMapper(GbizInfoKatsudoMapper):
|
class GbizInfoShokubaMapper(GbizInfoKatsudoMapper):
|
||||||
"""職場情報"""
|
"""職場情報"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def graph(self) -> URIRef:
|
||||||
|
return URIRef("http://hojin-info.go.jp/graph/shokuba")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
||||||
s = HJ_DATA[row["ID-識別値"]]
|
s = HJ_DATA[row["ID-識別値"]]
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from rdflib import URIRef
|
||||||
|
|
||||||
from ..namespace import *
|
from ..namespace import *
|
||||||
from . import _TripleMapType
|
from . import _TripleMapType
|
||||||
from ._katsudo import GbizInfoKatsudoMapper
|
from ._katsudo import GbizInfoKatsudoMapper
|
||||||
|
@ -6,6 +8,10 @@ from ._katsudo import GbizInfoKatsudoMapper
|
||||||
class GbizInfoTodokedeMapper(GbizInfoKatsudoMapper):
|
class GbizInfoTodokedeMapper(GbizInfoKatsudoMapper):
|
||||||
"""届出認定情報"""
|
"""届出認定情報"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def graph(self) -> URIRef:
|
||||||
|
return URIRef("http://hojin-info.go.jp/graph/todokede")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
||||||
ss = HJ_EXT[f"{row['ID-識別値']}_{row['キー情報']}"]
|
ss = HJ_EXT[f"{row['ID-識別値']}_{row['キー情報']}"]
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from rdflib import URIRef
|
||||||
|
|
||||||
from ..namespace import *
|
from ..namespace import *
|
||||||
from . import _TripleMapType, bpo
|
from . import _TripleMapType, bpo
|
||||||
from ._katsudo import GbizInfoKatsudoMapper
|
from ._katsudo import GbizInfoKatsudoMapper
|
||||||
|
@ -6,6 +8,10 @@ from ._katsudo import GbizInfoKatsudoMapper
|
||||||
class GbizInfoTokkyoMapper(GbizInfoKatsudoMapper):
|
class GbizInfoTokkyoMapper(GbizInfoKatsudoMapper):
|
||||||
"""特許情報"""
|
"""特許情報"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def graph(self) -> URIRef:
|
||||||
|
return URIRef("http://hojin-info.go.jp/graph/tokkyo")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
||||||
ss = HJ_EXT[f"{row['ID-識別値']}_{row['キー情報']}"]
|
ss = HJ_EXT[f"{row['ID-識別値']}_{row['キー情報']}"]
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from rdflib import URIRef
|
||||||
|
|
||||||
from ..namespace import *
|
from ..namespace import *
|
||||||
from . import Literal, _TripleMapType, bpo
|
from . import Literal, _TripleMapType, bpo
|
||||||
from ._katsudo import GbizInfoKatsudoMapper
|
from ._katsudo import GbizInfoKatsudoMapper
|
||||||
|
@ -6,6 +8,10 @@ from ._katsudo import GbizInfoKatsudoMapper
|
||||||
class GbizInfoZaimuMapper(GbizInfoKatsudoMapper):
|
class GbizInfoZaimuMapper(GbizInfoKatsudoMapper):
|
||||||
"""財務情報"""
|
"""財務情報"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def graph(self) -> URIRef:
|
||||||
|
return URIRef("http://hojin-info.go.jp/graph/zaimu")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
def map_to_triples(row: dict[str, str]) -> list[_TripleMapType]:
|
||||||
s = HJ_DATA[row["ID-識別値"]]
|
s = HJ_DATA[row["ID-識別値"]]
|
||||||
|
|
Loading…
Reference in a new issue