Source code for ferc_xbrl_extractor.cli

"""A command line interface (CLI) to the xbrl extractor."""

import argparse
import io
import logging
import sys
from pathlib import Path

import coloredlogs
from sqlalchemy import create_engine

from ferc_xbrl_extractor import helpers, xbrl
from ferc_xbrl_extractor.helpers import get_logger

# Published FERC taxonomy entry points (most recent available version),
# keyed by form number. Used by run_main() as the fallback schema source
# when no explicit --taxonomy is given on the command line.
TAXONOMY_MAP = {
    1: "https://eCollection.ferc.gov/taxonomy/form1/2022-01-01/form/form1/form-1_2022-01-01.xsd",
    2: "https://eCollection.ferc.gov/taxonomy/form2/2022-01-01/form/form2/form-2_2022-01-01.xsd",
    6: "https://eCollection.ferc.gov/taxonomy/form6/2022-01-01/form/form6/form-6_2022-01-01.xsd",
    60: "https://eCollection.ferc.gov/taxonomy/form60/2022-01-01/form/form60/form-60_2022-01-01.xsd",
    714: "https://eCollection.ferc.gov/taxonomy/form714/2022-01-01/form/form714/form-714_2022-01-01.xsd",
}
def parse():
    """Process base commands from the CLI.

    Builds the argument parser for the XBRL extractor and parses
    ``sys.argv``.

    Returns:
        argparse.Namespace: parsed options whose attribute names match the
        keyword parameters of :func:`run_main`.
    """
    parser = argparse.ArgumentParser(description="Extract data from XBRL filings")
    parser.add_argument(
        "instance_path",
        help="Path to a single xbrl filing, or a directory of xbrl filings",
        type=Path,
    )
    parser.add_argument(
        "sql_path", help="Store data in sqlite database specified in argument"
    )
    parser.add_argument(
        "-s",
        "--datapackage-path",
        default=None,
        type=Path,
        help="Generate frictionless datapackage descriptor, and write to JSON file at specified path.",
    )
    parser.add_argument(
        "-c",
        "--clobber",
        action="store_true",
        default=False,
        help="Clobber existing outputs if they exist",
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        default=None,
        type=int,
        # Fixed typo: missing space before "(defaults ...)".
        help="Specify number of instances to be processed at a time (defaults to one large batch)",
    )
    parser.add_argument(
        "-w",
        "--workers",
        default=None,
        type=int,
        help="Specify number of workers in pool (will attempt to choose a reasonable default if not specified)",
    )
    parser.add_argument(
        "-t",
        "--taxonomy",
        default=None,
        # Fixed grammar: "used create" -> "used to create".
        help="Specify taxonomy used to create structure of final database",
    )
    parser.add_argument(
        "-f",
        "--form-number",
        default=1,
        type=int,
        help=(
            "Specify form number to choose taxonomy used to generate output schema "
            "(if a taxonomy is explicitly specified that will override this "
            "parameter). Form number is also used for setting the name of the "
            "datapackage descriptor if requested."
        ),
    )
    parser.add_argument(
        "-a",
        "--entry-point",
        default=None,
        type=Path,
        help=(
            "Specify path to taxonomy entry point within a zipfile archive. This is "
            "a relative path within the taxonomy. If specified, `taxonomy` must be "
            "set to point to the zipfile location on the local file system."
        ),
    )
    parser.add_argument(
        "-m",
        "--metadata-path",
        default=None,
        type=Path,
        # Reconstructed: this help string was broken across two lines in the
        # rendered source (unterminated literal); rejoined into one string.
        help=(
            "Specify path to output metadata extracted taxonomy. Metadata will "
            "not be extracted if no path is specified."
        ),
    )
    parser.add_argument(
        "--loglevel",
        help="Set log level (valid arguments include DEBUG, INFO, WARNING, ERROR, CRITICAL)",
        default="INFO",
    )
    parser.add_argument("--logfile", help="Path to logfile", type=Path, default=None)
    parser.add_argument(
        "--instance-pattern",
        help="Regex pattern for filing name - if not provided, defaults to '' which matches all.",
        default=r"",
    )
    parser.add_argument(
        "--requested-tables",
        help="Table names to extract - if none, will default to all. Includes the _duration/_instant suffix.",
        nargs="+",
        default=None,
    )
    return parser.parse_args()
def run_main(
    instance_path: Path | io.BytesIO,
    sql_path: Path,
    clobber: bool,
    taxonomy: Path | io.BytesIO | None,
    entry_point: Path,
    form_number: int | None,
    metadata_path: Path | None,
    datapackage_path: Path | None,
    workers: int | None,
    batch_size: int | None,
    loglevel: str,
    logfile: Path | None,
    requested_tables: list[str] | None = None,
    instance_pattern: str = r"",
):
    """Log setup, taxonomy finding, and SQL IO.

    Configures logging, resolves the taxonomy to use (explicit path or the
    published default for ``form_number``), runs the extraction, and appends
    the resulting tables to the SQLite database at ``sql_path``.

    Raises:
        ValueError: if ``entry_point`` is given without ``taxonomy``, or if
            ``form_number`` has no entry in ``TAXONOMY_MAP``.
    """
    # Colorized console logging, plus an optional file handler.
    root_logger = get_logger("ferc_xbrl_extractor")
    root_logger.setLevel(loglevel)
    fmt = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
    coloredlogs.install(fmt=fmt, level=loglevel, logger=root_logger)
    if logfile:
        file_handler = logging.FileHandler(logfile)
        file_handler.setFormatter(logging.Formatter(fmt))
        root_logger.addHandler(file_handler)

    connection_string = f"sqlite:///{sql_path}"
    engine = create_engine(connection_string)
    if clobber:
        helpers.drop_tables(engine)

    # An entry point is a path *inside* a taxonomy archive, so it is
    # meaningless without the archive itself.
    if entry_point and not taxonomy:
        raise ValueError("taxonomy must be set if archive_path is given.")

    # No explicit taxonomy: fall back to the published one for this form.
    if taxonomy is None:
        if form_number not in TAXONOMY_MAP:
            raise ValueError(
                f"Form number {form_number} is not valid. Supported form numbers include {list(TAXONOMY_MAP.keys())}"
            )
        taxonomy = TAXONOMY_MAP[form_number]

    extracted = xbrl.extract(
        taxonomy_source=taxonomy,
        form_number=form_number,
        db_uri=connection_string,
        entry_point=entry_point,
        datapackage_path=datapackage_path,
        metadata_path=metadata_path,
        instance_path=instance_path,
        workers=workers,
        batch_size=batch_size,
        requested_tables=requested_tables,
        instance_pattern=instance_pattern,
    )

    # Append every non-empty extracted table to the database in one
    # transaction.
    with engine.begin() as connection:
        for name, frame in extracted.table_data.items():
            if not frame.empty:
                frame.to_sql(name, connection, if_exists="append")
def main():
    """Parse arguments and pass to run_main."""
    cli_options = vars(parse())
    return run_main(**cli_options)
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())