Support gzip-compressed output

This commit is contained in:
Babibubebon 2024-06-19 01:29:40 +09:00
parent 18970dd8aa
commit 79df6f1a90
Signed by: Babibubebon
GPG key ID: 78C8FB2A2FEA1EE3
3 changed files with 21 additions and 7 deletions

View file

@ -1,4 +1,5 @@
import csv import csv
import gzip
import os import os
import shutil import shutil
import time import time
@ -127,11 +128,18 @@ MAPPER_TYPES = [
@click.option( @click.option(
"--format", "--format",
"-f", "-f",
"_format",
type=click.Choice([v.name for v in RDFFormatType]), type=click.Choice([v.name for v in RDFFormatType]),
default=RDFFormatType.nq.name, default=RDFFormatType.nq.name,
) )
@click.option("--compress", "-c", is_flag=True)
def convert( def convert(
work_dir: str, mappers: list[str], processes: int, output_dir: str, format: str work_dir: str,
mappers: list[str],
processes: int,
output_dir: str,
_format: str,
compress: bool,
): ):
if not mappers: if not mappers:
mappers = MAPPER_TYPES mappers = MAPPER_TYPES
@ -175,11 +183,15 @@ def convert(
case _: case _:
raise NotImplementedError raise NotImplementedError
output_file = os.path.join(output_dir, f"{m}.{format}") output_file = os.path.join(
output_dir, f"{m}.{_format}" + (".gz" if compress else "")
)
click.echo(f"output: {output_file}") click.echo(f"output: {output_file}")
click.echo(f"Running {m} mapper ...") click.echo(f"Running {m} mapper ...")
with open(output_file, "w") as f:
mapper.run(n_jobs=processes, output=f, format=RDFFormatType[format]) f = gzip.open(output_file, "wt") if compress else open(output_file, "w")
mapper.run(n_jobs=processes, output=f, format=RDFFormatType[_format])
f.close()
@cli.command(help="Fetch CSV data from OutputCSV endpoint") @cli.command(help="Fetch CSV data from OutputCSV endpoint")

View file

@ -1,16 +1,17 @@
import csv import csv
import sys import sys
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import IO, Iterator, Tuple, Union
from enum import Enum from enum import Enum
from typing import IO, Iterator, Tuple, Union
from joblib import Parallel, delayed from joblib import Parallel, delayed
from rdflib import BNode, URIRef from rdflib import BNode
from rdflib import Literal as LiteralRdflib from rdflib import Literal as LiteralRdflib
from rdflib import URIRef
from rdflib.graph import _ObjectType, _PredicateType, _SubjectType, _TripleType from rdflib.graph import _ObjectType, _PredicateType, _SubjectType, _TripleType
from rdflib.namespace import RDF from rdflib.namespace import RDF
from rdflib.plugins.serializers.nt import _nt_row
from rdflib.plugins.serializers.nquads import _nq_row from rdflib.plugins.serializers.nquads import _nq_row
from rdflib.plugins.serializers.nt import _nt_row
_TripleMapType = Tuple[ _TripleMapType = Tuple[
_SubjectType, _PredicateType, Union[str, _ObjectType, "BlankPredicateObjectMap"] _SubjectType, _PredicateType, Union[str, _ObjectType, "BlankPredicateObjectMap"]

View file

@ -1,4 +1,5 @@
from rdflib import URIRef from rdflib import URIRef
from ..namespace import * from ..namespace import *
from . import _TripleMapType from . import _TripleMapType
from ._katsudo import GbizInfoKatsudoMapper from ._katsudo import GbizInfoKatsudoMapper