Finished parsing.py initial implementation, now have a sqlite database generating >600MB of timetable records. Next will be generating sqlalchemy desriptors based on the automated specifications. If I can re-learn sqlalchemy that is.

2026-05-22 16:57:14 +01:00
parent d63f151c9b
commit f35cda6f10
3 changed files with 108 additions and 20 deletions
@@ -1,10 +1,3 @@
-from national_rail_timetable.nr_requests import fetch_nr_token, fetch_nr_timetable_files
+from national_rail_timetable.parsing import main
 from national_rail_timetable.parsing import (
    extract_specification_document_tables,
    store_specification_table_raws,
    read_specification_table_raws,
    create_mca_specification_dbschema,
    main,
 )
 print(main())
@@ -65,7 +65,7 @@ def fetch_nr_token(
 def fetch_nr_timetable_files(
    config: NRConfig | None = None,  # pyright: ignore[reportRedeclaration]
    token: str | None = None,  # pyright: ignore[reportRedeclaration]
-    attempts: int = 1,
+    attempts: int = 3,
 ) -> ZipFile:
    config: NRConfig = config if config is not None else NRConfig.from_env()
    token: str = (
@@ -9,14 +9,17 @@ Aimed primarily towards producing a reduced sqlite database.
 # pyright: reportUnknownLambdaType=false
 # Imports
 import os
 import sqlite3
 from itertools import pairwise
 from pathlib import Path
-import pandas as pd
+from zipfile import ZipFile
 import numpy as np
-import sqlite3
+import pandas as pd
 import os
 from pypdf import PageObject, PdfReader
 from national_rail_timetable.nr_requests import fetch_nr_timetable_files
 # Init.
 SPECIFICATION_TABLE_LOCATIONS = {
@@ -34,6 +37,7 @@ SPECIFICATION_TABLE_LOCATIONS = {
    "MCA_ZZ": (23, 1),
 }
 DEFAULT_RAW_SPEC_DATA_DIR = Path(__file__).parents[2] / "data/specification_tables"
 DEFAULT_DB_PATH = Path.home() / ".cache/nr_data/timetable.db"
 # Functions
@@ -149,16 +153,21 @@ def read_specification_table_raws(
    return tables
-def create_mca_specification_dbschema(
+def _validate_db_path(db_path: Path | None = None):
    tables: dict[str, pd.DataFrame],
    db_path: Path | None = None,
 ):
    db_path = (
        db_path
        if db_path is not None
-        else Path(os.environ.get("NR_DATADIR", "~/.cache/nr_data/timetable.db"))
+        else Path(os.environ.get("NR_DATADIR", DEFAULT_DB_PATH))
    )
    db_path.parent.mkdir(exist_ok=True, parents=True)
    return db_path
 def create_mca_specification_dbtables(
    tables: dict[str, pd.DataFrame],
    db_path: Path | None = None,
 ):
    db_path = _validate_db_path(db_path)
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    for name, df in tables.items():
@@ -179,13 +188,98 @@ def create_mca_specification_dbschema(
                INSERT INTO spec_{name.lower()}
                VALUES({", ".join(["?" for _ in df.columns])})
            """,
-            [list(row.values) for _, row in df.iterrows()],
+            [tuple(row.values) for _, row in df.iterrows()],
        )
        connection.commit()
    connection.close()
    return db_path
 #  TODO: There is no need for this to take minutes, investigate speed ups.
 def create_mca_raw_dbtables(
    zipfile: ZipFile | None = None,
    db_path: Path | None = None,
    allow_fetch: bool = True,
 ) -> dict[str, str]:
    db_path = _validate_db_path(db_path)
    if zipfile is None:
        if allow_fetch:
            zipfile = fetch_nr_timetable_files()
        else:
            raise RuntimeError(
                "There was no zipfile provided and allow_fetch is set to False. "
                + "Please either allow automatic fetching, or supply the zipfile argument. "
                + "The package's fetching function is fetch_nr_timetable_files. "
            )
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    tables = [
        row[0]
        for row in cursor.execute(
            "SELECT name FROM sqlite_master WHERE type = 'table'"
        ).fetchall()
        if row[0][:9] == "spec_mca_"
    ]
    if len(tables) == 0:
        raise FileNotFoundError(
            "No spec_mca_... tables found in given database. "
            + "Please ensure create_mca_specification_dbtables has been run successfully. "
        )
    mappings = {}
    all_start_indexes = {}
    all_end_indexes = {}
    for name in tables:
        mappings[name] = "raw_" + name[5:]
        spec = pd.DataFrame(
            cursor.execute(f"SELECT * FROM {name}").fetchall(),
            columns=[d[0] for d in cursor.description],
        )
        all_start_indexes[mappings[name]] = spec.start_index.values
        all_end_indexes[mappings[name]] = spec.end_index.values
        new_columns = [
            col.split("/")[0].lower().replace(" ", "_").replace("-", "_")
            for col in spec.field_description
        ]
        _ = cursor.execute(f"DROP TABLE IF EXISTS {mappings[name].lower()}")
        _ = cursor.execute(
            f"""
                CREATE TABLE {mappings[name]}
                ({(", ".join(new_columns))})
            """,
        )
        connection.commit()
    zipinfo = [
        zipinfo
        for name, zipinfo in zipfile.NameToInfo.items()
        if name.split(".")[-1] == "MCA"
    ][0]
    file = zipfile.open(zipinfo)
    counter = 0
    while (line := file.readline().decode()) != "":
        record_type = line[:2]
        target_table = f"raw_mca_{record_type.lower()}"
        start_indexes = all_start_indexes.get(target_table)
        end_indexes = all_end_indexes.get(target_table)
        if start_indexes is None or end_indexes is None:
            continue
        values = ", ".join(
            [
                "'" + line[lb:ub].replace("'", "") + "'"
                for lb, ub in zip(start_indexes, end_indexes, strict=True)
            ]
        )
        _ = cursor.execute(f"INSERT INTO {target_table} VALUES({values})")
        counter += 1
        if counter % 1111 == 0:
            print(f" {counter:,}", end="\r")
    print()
    connection.commit()
    connection.close()
    return mappings
 # Script
 def main(
    skip_pdf: bool = False,
@@ -209,8 +303,9 @@ def main(
            + "Manual fix: extract_specification_document_tables then store_specification_table_raws. "
        )
-    _ = create_mca_specification_dbschema(tables)
+    _ = create_mca_specification_dbtables(tables)
    return create_mca_raw_dbtables()
 if __name__ == "__main__":
-    main()
+    _ = main()