diff --git a/gbizinfo_lod/command.py b/gbizinfo_lod/command.py
index 2bbd39c..49f3e43 100644
--- a/gbizinfo_lod/command.py
+++ b/gbizinfo_lod/command.py
@@ -1,4 +1,5 @@
 import csv
+import gzip
 import os
 import shutil
 import time
@@ -127,11 +128,18 @@ MAPPER_TYPES = [
 @click.option(
     "--format",
     "-f",
+    "_format",
     type=click.Choice([v.name for v in RDFFormatType]),
     default=RDFFormatType.nq.name,
 )
+@click.option("--compress", "-c", is_flag=True)
 def convert(
-    work_dir: str, mappers: list[str], processes: int, output_dir: str, format: str
+    work_dir: str,
+    mappers: list[str],
+    processes: int,
+    output_dir: str,
+    _format: str,
+    compress: bool,
 ):
     if not mappers:
         mappers = MAPPER_TYPES
@@ -175,11 +183,16 @@ def convert(
             case _:
                 raise NotImplementedError
 
-        output_file = os.path.join(output_dir, f"{m}.{format}")
+        output_file = os.path.join(
+            output_dir, f"{m}.{_format}" + (".gz" if compress else "")
+        )
         click.echo(f"output: {output_file}")
         click.echo(f"Running {m} mapper ...")
-        with open(output_file, "w") as f:
-            mapper.run(n_jobs=processes, output=f, format=RDFFormatType[format])
+        # Choose the opener first and keep the context manager: the plain or
+        # gzip text stream is then flushed and closed even if the mapper raises.
+        opener = gzip.open if compress else open
+        with opener(output_file, "wt") as f:
+            mapper.run(n_jobs=processes, output=f, format=RDFFormatType[_format])
 
 
 @cli.command(help="Fetch CSV data from OutputCSV endpoint")
diff --git a/gbizinfo_lod/mappers/__init__.py b/gbizinfo_lod/mappers/__init__.py
index d8c49f2..41925d7 100644
--- a/gbizinfo_lod/mappers/__init__.py
+++ b/gbizinfo_lod/mappers/__init__.py
@@ -1,16 +1,17 @@
 import csv
 import sys
 from abc import ABC, abstractmethod
-from typing import IO, Iterator, Tuple, Union
 from enum import Enum
+from typing import IO, Iterator, Tuple, Union
 
 from joblib import Parallel, delayed
-from rdflib import BNode, URIRef
+from rdflib import BNode
 from rdflib import Literal as LiteralRdflib
+from rdflib import URIRef
 from rdflib.graph import _ObjectType, _PredicateType, _SubjectType, _TripleType
 from rdflib.namespace import RDF
-from rdflib.plugins.serializers.nt import _nt_row
 from rdflib.plugins.serializers.nquads import _nq_row
+from rdflib.plugins.serializers.nt import _nt_row
 
 _TripleMapType = Tuple[
     _SubjectType, _PredicateType, Union[str, _ObjectType, "BlankPredicateObjectMap"]
diff --git a/gbizinfo_lod/mappers/chotatsu.py b/gbizinfo_lod/mappers/chotatsu.py
index af103cc..9666f95 100644
--- a/gbizinfo_lod/mappers/chotatsu.py
+++ b/gbizinfo_lod/mappers/chotatsu.py
@@ -1,4 +1,5 @@
 from rdflib import URIRef
+
 from ..namespace import *
 from . import _TripleMapType
 from ._katsudo import GbizInfoKatsudoMapper