Removal of sqlite/sqlalchemy based approach - it is too slow to combine the results, even with in-memory database loading.

2026-05-26 10:17:28 +01:00
parent 0479f1e4a8
commit e887cc791e
4 changed files with 2 additions and 1449 deletions
@@ -11,7 +11,8 @@ dependencies = [
    "pypdf (>=6.12.0,<7.0.0)",
    "pandas (>=3.0.3,<4.0.0)",
    "pandas-stubs (>=3.0.0.260204,<4.0.0.0)",
-    "sqlalchemy (>=2.0.49,<3.0.0)"
+    "sqlalchemy (>=2.0.49,<3.0.0)",
    "py-spy (>=0.4.2,<0.5.0)"
 ]
@@ -1,279 +0,0 @@
 """
 Queries for the 'raw_mca_...' tables generated in parsing.py.
 Thus far, an attempt has been made to assume as little as possible about record types.
 These queries will outright expect certain properties of the databases generated.
 If they suddenly stop working - it should be due to an RSP specification change.
 Therefore, the error handling will be less graceful as this is not predictable.
 It seems unlikely that the CIF format will be modified any time soon.
 """
 # Imports
 from collections.abc import Iterable
 import sqlite3
 from datetime import datetime
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 import numpy as np
 import pandas as pd
 from sqlalchemy import (
    Column,
    Engine,
    MetaData,
    Select,
    Table,
    create_engine,
    select,
 )
 from national_rail_timetable.mca_record_types import (
    BS,
    BX,
    CR,
    LI,
    LO,
    LT,
    BaseRecord,
 )
 from national_rail_timetable.parsing import validate_db_path
 # Classes
 class SixDate(str):
     """A date held as a six-digit ``YYMMDD`` string.

     The string value itself is what gets compared against the date columns in
     queries; the helper methods convert to and from other date
     representations. The century is always taken to be ``20``.

     Raises:
         ValueError: if the value is not exactly six ASCII digits.
     """

     def __new__(cls, *args, **kwargs):
         # Validation has to live here: ``str`` subclasses are not dataclasses,
         # so a ``__post_init__`` hook is never invoked automatically.
         instance = super().__new__(cls, *args, **kwargs)
         instance._validate()
         return instance

     def _validate(self) -> None:
         """Raise ValueError unless the value is exactly six ASCII digits."""
         if len(self) != 6 or not (self.isascii() and self.isdigit()):
             raise ValueError(
                 f"SixDate expects six digits in YYMMDD form, got {str(self)!r}"
             )

     def __post_init__(self):
         """Re-run validation; retained so existing explicit calls keep working."""
         self._validate()

     def as_unix(self):
         """Return the date as an ISO-style ``20YY-MM-DD`` string."""
         return f"20{self[:2]}-{self[2:4]}-{self[-2:]}"

     def as_numpy(self):
         """Return the date as a ``numpy.datetime64``."""
         return np.datetime64(self.as_unix())

     def as_datetime(self):
         """Return the date as a ``datetime`` at midnight."""
         return datetime.fromisoformat(self.as_unix())

     @classmethod
     def from_unix(cls, unix_str: str):
         """Build from a ``YYYY-MM-DD`` or ``YYYY/MM/DD`` string."""
         # Drop the century, then the separators, leaving YYMMDD.
         return cls(unix_str[2:].replace("-", "").replace("/", ""))

     @classmethod
     def from_numpy(cls, np_dt: np.datetime64):
         """Build from a ``numpy.datetime64``, discarding any time component."""
         return cls.from_unix(str(np_dt).split("T")[0])

     @classmethod
     def from_datetime(cls, dt: datetime):
         """Build from a ``datetime``, discarding any time component."""
         return cls.from_unix(str(dt).split(" ")[0])

     @property
     def weekday(self) -> int:
         """Day of the week as an integer, Monday being 0."""
         return self.as_datetime().weekday()

     @property
     def weekday_like(self) -> str:
         """SQL ``LIKE`` pattern requiring a ``1`` at this date's weekday position."""
         return "_" * self.weekday + "1%"
@dataclass
 class Schedule:
    """All records fetched for a single schedule number.

    Instances are built by Timetable.fetch_schedules from its BS, BX, LO, LI,
    LT and CR queries.
    """
    sn: int  # schedule number the records were selected by
    bs: pd.Series  # first BS row for this schedule number
    bx: pd.Series  # first BX row for this schedule number
    loit: pd.DataFrame  # LO, LI and LT rows concatenated in that order, index reset
    cr: pd.DataFrame  # CR rows for this schedule number
@dataclass
 class Service(Schedule):
    """A Schedule paired with the date it was selected for."""
    date: SixDate  # the date passed to services_date_and_tiploc
@dataclass
class Timetable:
    """Query interface over the ``raw_mca_...`` tables of a SQLite database.

    Attributes:
        engine: SQLAlchemy engine for the database; defaults to the path
            returned by ``validate_db_path()``.
        metadata: SQLAlchemy metadata the reflected tables are registered on.
        tables: mapping of table name to ``Table`` for every ``raw_mca_...``
            table found in the database, filled in on construction.
    """

    engine: Engine = field(
        default_factory=lambda: create_engine(
            url=f"sqlite:///{validate_db_path().as_posix()}"
        )
    )
    metadata: MetaData = field(default_factory=MetaData)
    tables: dict[str, Table] = field(default_factory=dict)

    def __post_init__(self):
        self._populate_self_tables()

    def _populate_self_tables(self):
        """Register a ``Table`` for every ``raw_mca_...`` table in the database."""
        connection = self._generate_sqlite3_connection()
        try:
            cursor = connection.cursor()
            names = [
                row[0]
                for row in cursor.execute(  # pyright: ignore[reportAny]
                    "SELECT name FROM sqlite_master WHERE type = 'table'"
                ).fetchall()
                if row[0][:8] == "raw_mca_"
            ]
            discovered = {}
            for name in names:  # pyright: ignore[reportAny]
                # Table names come from sqlite_master, not from user input;
                # identifiers cannot be bound as query parameters.
                description = cursor.execute(
                    f"SELECT * FROM {name} LIMIT 1"
                ).description
                discovered[name] = Table(
                    name,  # pyright: ignore[reportAny]
                    self.metadata,
                    *[Column(d[0]) for d in description],
                )
            self.tables |= discovered
        finally:
            # Close even when introspection fails, so the handle is not leaked.
            connection.close()

    def _generate_sqlite3_connection(self) -> sqlite3.Connection:
        """Open a raw sqlite3 connection to the file the engine points at.

        Raises:
            ValueError: if the engine URL carries no database path.
        """
        # URL.database is the parsed path; no string-splitting of the rendered
        # URL (and no reliance on the deprecated URL.__to_string__) is needed.
        database = self.engine.url.database
        if database is None:
            raise ValueError(
                "The engine URL does not name a database file to connect to."
            )
        return sqlite3.connect(database)

    #  TODO: Implement docstrings from 'spec_mca_...' tables
    def _hardcode_table_dataclasses(self):
        """Write ``mca_record_types.py`` next to this module.

        The generated file declares one ``Table`` and one typed accessor object
        per entry in ``self.tables``, for type-hinting while writing queries.
        """
        text: str = (
            "# This file is pre-generated for type-hinting while writing MCA file queries. \n"
            + "# Any changes made manually will likely be overwritten. \n"
            + "# It should not need to be generated more than once. \n"
            + "# If the RSP's timetable specification changes, then this will need to be updated. \n"
            + "# Result of mca_queries.py's Timetable._hardcode_table_dataclasses. \n"
            + "\n"
            + "# Imports \n"
            + "from dataclasses import dataclass, field \n"
            + "from typing import Any \n"
            + "from sqlalchemy import Column, MetaData, Table, String, Integer \n"
            + "\n"
            + "# Init. \n"
            + "metadata = MetaData() \n"
            + "\n"
            + "# Classes \n"
            + "@dataclass \n"
            + "class BaseRecord: \n"
            + "\tall: Table \n\n"
            + "\t@property \n"
            + "\tdef schedule_number(self) -> Column[Integer]: ... \n\n"
            + "\t@property \n"
            + "\tdef line_number(self) -> Column[Integer]: ... \n\n"
        )
        for name, table in self.tables.items():
            columns = [column.name for column in table.columns]
            # Record type, e.g. 'raw_mca_bs' -> 'BS'.
            rr = name.split("_")[-1].upper()
            text += (
                f"_{rr}_columns = Table( \n"
                + f"\t'{name}', \n"
                + "\tmetadata, \n"
                + "".join([f"\tColumn('{column}'), \n" for column in columns])
                + ") \n\n"
            )
            text += (
                "@dataclass \n"
                + f"class _{rr}_base(BaseRecord): \n"
                + "\tall: Table \n\n"
                + "".join(
                    [
                        (
                            "\t@property \n"
                            + f"\tdef {column}(self) -> "
                            + f"Column[{'String' if column not in ['line_number', 'schedule_number'] else 'Integer'}]: \n"
                            + f"\t\treturn self.all.c.{column} \n\n"
                        )
                        for column in columns
                    ]
                )
            )
            text += f"{rr} = _{rr}_base(_{rr}_columns) \n\n"
        path = Path(__file__).parent / "mca_record_types.py"
        # Explicit encoding so the generated module does not depend on locale.
        with open(path, "w", encoding="utf-8") as wf:
            _ = wf.write(text.replace("\t", "    "))

    @classmethod
    def from_db_path(cls, db_path: Path | None = None):
        """Build a Timetable for the database at ``db_path`` (or the default)."""
        db_path = validate_db_path(db_path)
        return cls(engine=create_engine(url=f"sqlite:///{db_path.as_posix()}"))

    @classmethod
    def default(cls):
        """Build a Timetable for the default database path."""
        return cls.from_db_path()

    def execute(self, query: Select[Any]) -> pd.DataFrame:  # pyright: ignore[reportExplicitAny]
        """Run ``query`` on a fresh connection and return the rows as a DataFrame."""
        with self.engine.connect() as connection:
            return pd.read_sql(query, connection)

    def _fetch_records_of_schedules(
        self,
        record_type: BaseRecord,
        *schedule_numbers: int,
    ) -> pd.DataFrame:
        """Return every row of ``record_type`` whose schedule number is listed."""
        return self.execute(
            select(record_type.all).where(
                record_type.schedule_number.in_(schedule_numbers)
            )
        )

    def fetch_schedules(self, *schedule_numbers: int) -> dict[int, Schedule]:
        """Fetch the BS, BX, LO, LI, LT and CR records of the given schedules.

        Returns:
            A mapping of schedule number to its assembled ``Schedule``.

        Raises:
            IndexError: if a requested schedule has no BS or no BX row.
        """

        def rows_of(record_type: BaseRecord):
            # Group once per record type rather than re-scanning the whole
            # frame with a boolean mask for every schedule number.
            records = self._fetch_records_of_schedules(record_type, *schedule_numbers)
            groups = {
                key: rows for key, rows in records.groupby("schedule_number")
            }
            empty = records.iloc[0:0]
            return lambda sn: groups.get(sn, empty)

        bs = rows_of(BS)
        bx = rows_of(BX)
        lo = rows_of(LO)
        li = rows_of(LI)
        lt = rows_of(LT)
        cr = rows_of(CR)
        return {
            sn: Schedule(
                sn=sn,
                bs=bs(sn).iloc[0],
                bx=bx(sn).iloc[0],
                loit=pd.concat([lo(sn), li(sn), lt(sn)]).reset_index(drop=True),
                cr=cr(sn),
            )
            for sn in schedule_numbers
        }
 # Functions
 def services_date_and_tiploc(
     date: SixDate,
     tiploc: str,
     tt: Timetable | None = None,
 ):
     """Return the services running on ``date`` that involve ``tiploc``.

     A schedule qualifies when its BS record covers the date and weekday, and
     the location appears as its origin (LO), as an intermediate location (LI)
     whose scheduled departure time does not start with two spaces, or as its
     destination (LT).

     Args:
         date: the date to look up.
         tiploc: location code; left-justified to eight characters for matching.
         tt: timetable to query; a default ``Timetable()`` is built when omitted.

     Returns:
         A list of ``Service`` objects, one per matching schedule.
     """
     if tt is None:
         tt = Timetable()
     padded_tiploc = f"{tiploc:<8}"

     running_query = select(BS.schedule_number).where(
         (BS.date_runs_from <= date)
         & (BS.date_runs_to >= date)
         & (BS.days_run.like(date.weekday_like))
     )
     on_date: Iterable[int] = tt.execute(running_query).schedule_number.values

     origin_query = select(LO.schedule_number).where(LO.location == padded_tiploc)
     origin: Iterable[int] = tt.execute(origin_query).schedule_number.values

     en_route_query = select(LI.schedule_number).where(
         (LI.location == padded_tiploc)
         & (LI.scheduled_departure_time.not_like("  %"))
     )
     en_route: Iterable[int] = tt.execute(en_route_query).schedule_number.values

     destination_query = select(LT.schedule_number).where(
         LT.location == padded_tiploc
     )
     destination: Iterable[int] = tt.execute(destination_query).schedule_number.values

     at_location = np.unique([*origin, *en_route, *destination])
     matching = np.intersect1d(at_location, on_date)  # pyright: ignore[reportCallIssue, reportUnknownVariableType, reportArgumentType]
     schedule_numbers = [int(number) for number in matching]  # pyright: ignore[reportUnknownVariableType, reportUnknownArgumentType]

     services = []
     for schedule in tt.fetch_schedules(*schedule_numbers).values():
         services.append(
             Service(
                 date=date,
                 **schedule.__dict__,  # pyright: ignore[reportAny]
             )
         )
     return services
 # Script
 def main():
     """Print the services at YORK on 2026-05-25 and return how many there are."""
     services = services_date_and_tiploc(SixDate("260525"), "YORK")
     print(services)
     return len(services)


 if __name__ == "__main__":
     print(main())
@@ -9,17 +9,13 @@ Aimed primarily towards producing a reduced sqlite database.
 # pyright: reportUnknownLambdaType=false
 # Imports
 import os
 import sqlite3
 from itertools import pairwise
 from pathlib import Path
 from zipfile import ZipFile
 import numpy as np
 import pandas as pd
 from pypdf import PageObject, PdfReader
 from national_rail_timetable.nr_requests import fetch_nr_timetable_files
 # Init.
 SPECIFICATION_TABLE_LOCATIONS = {
@@ -37,7 +33,6 @@ SPECIFICATION_TABLE_LOCATIONS = {
    "MCA_ZZ": (23, 1),
 }
 # Directory "data/specification_tables" under the third ancestor of this file.
 # NOTE(review): presumably where the cached specification tables live — confirm against callers.
 DEFAULT_RAW_SPEC_DATA_DIR = Path(__file__).parents[2] / "data/specification_tables"
 # Fallback SQLite database file, used by validate_db_path when no path is
 # given and the NR_DATADIR environment variable is unset.
 DEFAULT_DB_PATH = Path.home() / ".cache/nr_data/timetable.db"
 # Functions
@@ -151,165 +146,3 @@ def read_specification_table_raws(
            continue
        tables[path.name[:-4]] = pd.read_csv(path)
    return tables
 def validate_db_path(db_path: Path | None = None):
    db_path = (
        db_path
        if db_path is not None
        else Path(os.environ.get("NR_DATADIR", DEFAULT_DB_PATH))
    )
    db_path.parent.mkdir(exist_ok=True, parents=True)
    return db_path
 def create_mca_specification_dbtables(
     tables: dict[str, pd.DataFrame],
     db_path: Path | None = None,
 ):
     """Store each ``MCA_xx`` specification table as a ``spec_mca_xx`` table.

     Only entries whose name is exactly ``MCA_<something>`` are stored. Each
     stored table gains ``start_index`` and ``end_index`` columns derived from
     its ``Position`` column ("a-b" gives start ``a - 1`` and end ``b``), i.e.
     slice bounds for the fixed-width record. Existing tables of the same name
     are dropped first. The DataFrames passed in are left unmodified.

     Args:
         tables: specification tables keyed by name.
         db_path: database file; resolved through ``validate_db_path``.

     Returns:
         The resolved database path.
     """
     db_path = validate_db_path(db_path)
     connection = sqlite3.connect(db_path)
     try:
         cursor = connection.cursor()
         for name, df in tables.items():
             name_parts = name.split("_")
             if name_parts[0] != "MCA" or len(name_parts) != 2:
                 continue
             positions = df["Position"]
             # assign() returns a new frame, so the caller's data is untouched.
             spec = df.assign(
                 **{
                     "Start Index": positions.apply(
                         lambda s: int(s.split("-")[0]) - 1
                     ),
                     "End Index": positions.apply(lambda s: int(s.split("-")[-1])),
                 }
             )
             table_name = f"spec_{name.lower()}"
             column_names = ", ".join(
                 col.lower().replace(" ", "_") for col in spec.columns
             )
             placeholders = ", ".join("?" for _ in spec.columns)
             _ = cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
             _ = cursor.execute(
                 f"""
                     CREATE TABLE {table_name}
                     ({column_names})
                 """,
             )
             _ = cursor.executemany(
                 f"""
                     INSERT INTO {table_name}
                     VALUES({placeholders})
                 """,
                 spec.itertuples(index=False, name=None),
             )
             connection.commit()
     finally:
         # Release the database handle even if a table fails to load.
         connection.close()
     return db_path
 #  TODO: There is no need for this to take minutes, investigate speed ups.
 def _flush_raw_rows(
     cursor: sqlite3.Cursor,
     insert_statements: dict[str, str],
     pending: dict[str, list[tuple]],
 ) -> None:
     """Insert all buffered rows with one executemany per table, then clear them."""
     for table, rows in pending.items():
         if rows:
             _ = cursor.executemany(insert_statements[table], rows)
     pending.clear()


 def create_mca_raw_dbtables(
     zipfile: ZipFile | None = None,
     db_path: Path | None = None,
     allow_fetch: bool = True,
     print_progress: bool = True,
 ) -> dict[str, str]:
     """Load the fixed-width MCA file into one ``raw_mca_xx`` table per record type.

     Every ``spec_mca_xx`` table in the database supplies the slice bounds and
     column names for its record type. Each line of the MCA file is cut with
     those bounds and stored along with its ``line_number`` (zero-based) and
     ``schedule_number`` (incremented at each ``BS`` record). Lines whose record
     type has no specification table are skipped. Existing ``raw_mca_xx`` tables
     are dropped and rebuilt.

     Rows are bound as parameters and inserted in batches, so field text is
     stored exactly as it appears in the file, apostrophes included.

     Args:
         zipfile: archive containing the ``.MCA`` member; fetched with
             ``fetch_nr_timetable_files`` when omitted and ``allow_fetch`` is set.
         db_path: database file; resolved through ``validate_db_path``.
         allow_fetch: whether a missing ``zipfile`` may be fetched.
         print_progress: whether to print a progress line while loading.

     Returns:
         Mapping of each ``spec_mca_xx`` table name to its ``raw_mca_xx`` table.

     Raises:
         RuntimeError: no ``zipfile`` was given and fetching is disallowed.
         FileNotFoundError: the database holds no ``spec_mca_...`` tables, or
             the archive holds no ``.MCA`` member.
     """
     batch_size = 50_000
     db_path = validate_db_path(db_path)
     if zipfile is None:
         if not allow_fetch:
             raise RuntimeError(
                 "There was no zipfile provided and allow_fetch is set to False. "
                 + "Please either allow automatic fetching, or supply the zipfile argument. "
                 + "The package's fetching function is fetch_nr_timetable_files. "
             )
         zipfile = fetch_nr_timetable_files()
     connection = sqlite3.connect(db_path)
     try:
         cursor = connection.cursor()
         tables = [
             row[0]
             for row in cursor.execute(
                 "SELECT name FROM sqlite_master WHERE type = 'table'"
             ).fetchall()
             if row[0][:9] == "spec_mca_"
         ]
         if not tables:
             raise FileNotFoundError(
                 "No spec_mca_... tables found in given database. "
                 + "Please ensure create_mca_specification_dbtables has been run successfully. "
             )
         mappings: dict[str, str] = {}
         field_slices: dict[str, list[slice]] = {}
         insert_statements: dict[str, str] = {}
         for name in tables:
             raw_name = mappings[name] = "raw_" + name[5:]
             spec = pd.DataFrame(
                 cursor.execute(f"SELECT * FROM {name}").fetchall(),
                 columns=[d[0] for d in cursor.description],
             )
             field_slices[raw_name] = [
                 slice(int(lb), int(ub))
                 for lb, ub in zip(spec.start_index, spec.end_index, strict=True)
             ]
             new_columns = [
                 col.split("/")[0].lower().replace(" ", "_").replace("-", "_")
                 for col in spec.field_description
             ] + ["line_number", "schedule_number"]
             placeholders = ", ".join("?" for _ in new_columns)
             # One fixed statement per table, reused for every batch.
             insert_statements[raw_name] = (
                 f"INSERT INTO {raw_name} VALUES({placeholders})"
             )
             _ = cursor.execute(f"DROP TABLE IF EXISTS {raw_name.lower()}")
             _ = cursor.execute(
                 f"""
                     CREATE TABLE {raw_name}
                     ({(", ".join(new_columns))})
                 """,
             )
             connection.commit()
         mca_members = [
             info
             for info in zipfile.infolist()
             if info.filename.split(".")[-1] == "MCA"
         ]
         if not mca_members:
             raise FileNotFoundError(
                 "No .MCA member was found in the supplied zipfile. "
             )
         pending: dict[str, list[tuple]] = {}
         pending_count = 0
         schedule_number = -1
         with zipfile.open(mca_members[0]) as file:
             for line_number, raw_line in enumerate(file):
                 line = raw_line.decode()
                 record_type = line[:2]
                 schedule_number += int(record_type == "BS")
                 target_table = f"raw_mca_{record_type.lower()}"
                 slices = field_slices.get(target_table)
                 if slices is None:
                     continue
                 pending.setdefault(target_table, []).append(
                     (*(line[s] for s in slices), line_number, schedule_number)
                 )
                 pending_count += 1
                 if pending_count >= batch_size:
                     _flush_raw_rows(cursor, insert_statements, pending)
                     pending_count = 0
                 if line_number % 3737 == 0 and print_progress:
                     print(f" {line_number:9,} {line[:-1]}", end="\r")
         _flush_raw_rows(cursor, insert_statements, pending)
         if print_progress:
             print()
         connection.commit()
     finally:
         # Release the database handle even if loading fails part-way.
         connection.close()
     return mappings
 # Script
 def main(
     skip_pdf: bool = False,
     pdf_spec_path: Path | None = None,
     raw_spec_dir: Path | None = None,
 ):
     """Build the specification tables and the raw MCA tables in the database.

     Args:
         skip_pdf: skip extracting the specification tables from the PDF.
         pdf_spec_path: location of the specification PDF, passed to
             ``extract_specification_document_tables``.
         raw_spec_dir: directory of the stored specification tables, passed to
             ``store_specification_table_raws`` / ``read_specification_table_raws``.

     Returns:
         The mapping returned by ``create_mca_raw_dbtables``.

     Raises:
         FileNotFoundError: the stored specification tables could not be read.
     """
     if not skip_pdf:
         try:
             tables = extract_specification_document_tables(pdf_spec_path)
             _ = store_specification_table_raws(tables, raw_spec_dir)
         except FileNotFoundError:
             # Best effort: without the PDF, fall through to the stored tables.
             pass
     try:
         tables = read_specification_table_raws(raw_spec_dir)
     except FileNotFoundError as err:
         # Chain the original error so the missing path stays in the traceback.
         raise FileNotFoundError(
             "The tables generated from the RSP's specification were not found. "
             + "This means neither the cached version nor the original .pdf is available. "
             + "Try supplying either to their default locations, or supplying custom directories. "
             + "Manual fix: extract_specification_document_tables then store_specification_table_raws. "
         ) from err
     _ = create_mca_specification_dbtables(tables)
     return create_mca_raw_dbtables()


 if __name__ == "__main__":
     _ = main()