Tracked mca_querying.py.

Progress towards sensible-enough numpy based querying using shared memory views rather than duplicated data.
Created mca_stubs.py which provides record entry details (and slice objects) for the .MCA types.
2026-05-26 17:33:25 +01:00 · 2026-05-26 17:32:56 +01:00 · 2026-05-26 11:06:28 +01:00 · 2026-05-26 10:17:28 +01:00 · 2026-05-25 21:21:53 +01:00 · 2026-05-25 17:46:13 +01:00
6 changed files with 859 additions and 69 deletions
@@ -10,7 +10,9 @@ dependencies = [
    "requests (>=2.34.2,<3.0.0)",
    "pypdf (>=6.12.0,<7.0.0)",
    "pandas (>=3.0.3,<4.0.0)",
-    "pandas-stubs (>=3.0.0.260204,<4.0.0.0)"
+    "pandas-stubs (>=3.0.0.260204,<4.0.0.0)",
+    "sqlalchemy (>=2.0.49,<3.0.0)",
+    "py-spy (>=0.4.2,<0.5.0)"
 ]


@@ -1,10 +1,3 @@
-from national_rail_timetable.nr_requests import fetch_nr_token, fetch_nr_timetable_files
-from national_rail_timetable.parsing import (
-    extract_specification_document_tables,
-    store_specification_table_raws,
-    read_specification_table_raws,
-    create_mca_specification_dbschema,
-    main,
-)
+from national_rail_timetable.mca_querying import main

 print(main())
@@ -0,0 +1,191 @@
+"""
+MCA file querying to retrieve schedule(d service)s.
+"""
+# pyright: reportAny=false
+# pyright: reportUnknownVariableType=false
+# pyright: reportUnknownArgumentType=false
+# pyright: reportAttributeAccessIssue=false
+# pyright: reportOperatorIssue=false
+
+# Imports
+from dataclasses import dataclass
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Callable, Self
+from zipfile import ZipFile
+import numpy as np
+from numpy.typing import NDArray
+from national_rail_timetable.nr_requests import fetch_nr_timetable_files
+from national_rail_timetable.mca_stubs import BS, BX, LI, LO, LT, CR
+
+
+# Functions
+def ux(s1: NDArray[np.byte]) -> NDArray[np.str_]:
+    """
+    Join each row of single-byte characters into one unicode string.
+
+    ``s1`` must be at least 2-D; its second dimension is taken as the string
+    width. The result has one ``U<width>`` string per row.
+
+    The bytes are reinterpreted through ``view`` rather than by assigning to
+    ``s1.dtype``, so the caller's array keeps its own dtype and shape.
+    """
+    width = s1.shape[1]
+    # Viewing as S<width> collapses the columns into a single column of strings.
+    return s1.view(f"S{width}").astype(f"U{width}")[:, 0]
+
+
+def sx(s1: NDArray[np.byte]) -> NDArray[np.byte]:
+    """
+    Join each row of single-byte characters into one byte string.
+
+    ``s1`` must be at least 2-D; its second dimension is taken as the string
+    width. The result has one ``S<width>`` value per row and shares memory with
+    ``s1`` (no copy is made).
+
+    The bytes are reinterpreted through ``view`` rather than by assigning to
+    ``s1.dtype``, so the caller's array keeps its own dtype and shape.
+    """
+    width = s1.shape[1]
+    # Viewing as S<width> collapses the columns into a single column of strings.
+    return s1.view(f"S{width}")[:, 0]
+
+
+# Classes
+@dataclass
+class Timetable:
+    """
+    A .MCA timetable file held in memory as a 2-D array of single bytes.
+
+    Each row of ``array`` is one record and each column one character position,
+    so a record field is addressed with a column slice.
+    """
+
+    array: NDArray[np.byte]
+
+    @classmethod
+    def from_zipfile(
+        cls,
+        zipfile: ZipFile | None = None,
+    ) -> Self:
+        """
+        Build a timetable from the first ``.MCA`` member of a zip archive.
+
+        When ``zipfile`` is ``None`` the archive is obtained from
+        ``fetch_nr_timetable_files()``.
+
+        Raises ``IndexError`` if the archive has no member ending in ``.MCA``.
+        """
+        zipfile = zipfile if zipfile is not None else fetch_nr_timetable_files()
+        # infolist() is the documented way to enumerate members.
+        member = next(
+            (
+                zipinfo
+                for zipinfo in zipfile.infolist()
+                if zipinfo.filename.split(".")[-1] == "MCA"
+            ),
+            None,
+        )
+        if member is None:
+            raise IndexError("zip archive contains no .MCA member")
+        with TemporaryDirectory() as tempdir:
+            # extract() returns the full path of the file it wrote.
+            extracted = zipfile.extract(member, path=tempdir)
+            # fromfile loads the whole file, so the array outlives the temp dir.
+            # Records are read as 82 bytes each with the last two discarded
+            # (presumably the line terminator - confirm against the file format).
+            array: NDArray[np.byte] = np.fromfile(extracted, dtype="S1").reshape(
+                (-1, 82)
+            )[:, :-2]
+        return cls(array=array)
+
+    def bs_mask(self) -> NDArray[np.bool]:
+        """Return a boolean row mask selecting records that start with ``BS``."""
+        return (self.array[:, 0] == b"B") & (self.array[:, 1] == b"S")
+
+    def lo_mask(self) -> NDArray[np.bool]:
+        """Return a boolean row mask selecting records that start with ``LO``."""
+        return (self.array[:, 0] == b"L") & (self.array[:, 1] == b"O")
+
+    @property
+    def sns(self) -> NDArray[np.integer]:
+        """
+        Schedule number of every row.
+
+        A row's schedule number is the row index of the closest ``BS`` record at
+        or before it; rows ahead of the first ``BS`` record get ``0``.
+        """
+        row_count = self.array.shape[0]
+        # Boundaries of each run of rows: file start, every BS row, file end.
+        starts = np.concatenate(([0], np.flatnonzero(self.bs_mask()), [row_count]))
+        return np.repeat(starts[:-1], np.diff(starts))
+
+    def fetch_schedules(self, *sns: int) -> NDArray[np.byte]:
+        """Return every row whose schedule number is one of ``sns``."""
+        return self.array[np.isin(self.sns, sns)]
+
+
+@dataclass
+class Query:
+    """
+    Chainable filter over the schedules of a ``Timetable``.
+
+    ``sns`` holds the schedule numbers (values taken from ``Timetable.sns``)
+    selected so far; ``None`` means no filter has been applied yet.
+
+    Return annotations naming this class are quoted because ``Query`` is not yet
+    bound while the class body executes.
+    """
+
+    tt: Timetable
+    sns: NDArray[np.integer] | None = None
+
+    def _query_from_mask(self, mask: NDArray[np.bool]) -> "Query":
+        """
+        Return a query for the schedules that own the rows selected by ``mask``.
+
+        When this query already carries a selection the result is narrowed to it.
+        """
+        # unique() keeps the selection sorted and duplicate-free on both paths
+        # (intersect1d already returns it in that form).
+        sns: NDArray[np.integer] = np.unique(self.tt.sns[mask])
+        if self.sns is not None:
+            sns = np.intersect1d(sns, self.sns)
+        return Query(self.tt, sns)
+
+    def _location_mask(
+        self,
+        record: LO | LT | LI,
+        tiploc: str,
+        call_number: str | int | None = None,
+    ) -> NDArray[np.bool]:
+        """
+        Return a row mask for ``record``-type rows located at ``tiploc``.
+
+        The record type is the class name of ``record``. The location field is
+        split into its first seven characters (compared with ``tiploc`` padded
+        to seven) and its final character, which is compared with
+        ``call_number`` when one is given.
+        """
+        identity = type(record).__name__.encode()
+        location = record.location()
+        mask: NDArray[np.bool] = (
+            sx(self._a[:, record.record_identity()]) == identity
+        ) & (
+            sx(self._a[:, location.start : location.stop - 1])
+            == f"{tiploc:<7}".encode()
+        )
+        if call_number is not None:
+            # A one-column slice keeps the array 2-D, which sx() requires.
+            mask &= (
+                sx(self._a[:, location.stop - 1 : location.stop])
+                == str(call_number).encode()
+            )
+        return mask
+
+    def on_date(self, date: str) -> "Query":
+        """
+        Keep schedules whose ``BS`` record spans ``date``.
+
+        ``date`` is compared bytewise with the ``date_runs_from`` and
+        ``date_runs_to`` fields, so it must use the same ``yymmdd`` layout.
+        Only the date range is checked; no other ``BS`` field is consulted.
+        """
+        day = date.encode()
+        bs = BS()
+        mask: NDArray[np.bool] = (
+            (sx(self._a[:, bs.record_identity()]) == b"BS")
+            & (sx(self._a[:, bs.date_runs_from()]) <= day)
+            & (sx(self._a[:, bs.date_runs_to()]) >= day)
+        )
+        return self._query_from_mask(mask)
+
+    def origin(self, tiploc: str) -> "Query":
+        """Keep schedules whose ``LO`` record is located at ``tiploc``."""
+        return self._query_from_mask(self._location_mask(LO(), tiploc))
+
+    def dest(self, tiploc: str, call_number: str | int | None = None) -> "Query":
+        """
+        Keep schedules whose ``LT`` record is located at ``tiploc``.
+
+        When ``call_number`` is given the location suffix must equal it too.
+        """
+        return self._query_from_mask(self._location_mask(LT(), tiploc, call_number))
+
+    def calls(self, tiploc: str, call_number: str | int | None = None) -> "Query":
+        """
+        Keep schedules with an ``LI``, ``LO`` or ``LT`` record at ``tiploc``.
+
+        ``call_number`` constrains only the ``LI`` match; the ``LO`` and ``LT``
+        matches are made on ``tiploc`` alone.
+        """
+        intermediate = self._query_from_mask(
+            self._location_mask(LI(), tiploc, call_number)
+        )
+        return intermediate | self.origin(tiploc) | self.dest(tiploc)
+
+    def get_field(self, mca_field: Callable[..., slice]):
+        """
+        Return one field from every matching record of the selected schedules.
+
+        ``mca_field`` is a bound stub method such as ``BX().retail_service_id``.
+        The leading part of its qualified name gives the two-letter record type
+        and calling it gives the column slice of the field.
+        """
+        record_type = mca_field.__qualname__.split(".")[0]
+        rows = self.result
+        return sx(rows[sx(rows[:, :2]) == record_type.encode(), mca_field()])
+
+    @property
+    def _a(self):
+        """Shorthand for the timetable's underlying byte array."""
+        return self.tt.array
+
+    @property
+    def result(self) -> NDArray[np.byte]:
+        """Rows of every selected schedule; requires a filter to have been applied."""
+        assert self.sns is not None
+        return self.tt.fetch_schedules(*self.sns)
+
+    def as_sx(self) -> NDArray[np.byte]:
+        """Return the selected rows as one byte string per row."""
+        return sx(self.result)
+
+    def as_ux(self) -> NDArray[np.str_]:
+        """Return the selected rows as one unicode string per row."""
+        return ux(self.result)
+
+    def __and__(self, other: "Query") -> "Query":
+        """Return the schedules selected by both queries (same timetable only)."""
+        assert self.tt is other.tt
+        assert self.sns is not None
+        assert other.sns is not None
+        return Query(self.tt, np.intersect1d(self.sns, other.sns))
+
+    def __or__(self, other: "Query") -> "Query":
+        """Return the schedules selected by either query (same timetable only)."""
+        assert self.tt is other.tt
+        assert self.sns is not None
+        assert other.sns is not None
+        return Query(self.tt, np.union1d(self.sns, other.sns))
+
+
+# Script
+def main():
+    """
+    Run a sample query against the timetable and print the results.
+
+    The timetable array is loaded from ``data/cache.mca.npy`` two directory
+    levels above this file's folder. If that file is missing, the timetable is
+    built with ``Timetable.from_zipfile()`` and saved there for the next run.
+    """
+    cache_path = Path(__file__).parents[2] / "data/cache.mca.npy"
+
+    try:
+        tt = Timetable(np.load(cache_path))
+    except FileNotFoundError:
+        tt = Timetable.from_zipfile()
+        # np.save does not create missing directories.
+        cache_path.parent.mkdir(parents=True, exist_ok=True)
+        np.save(cache_path, tt.array)
+
+    result = Query(tt).on_date("260526").calls("CRMLNGT")
+    print(result.as_ux())
+    print(result.get_field(BX().retail_service_id))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,622 @@
+# This file is generated by parsing.generate_mca_stubs.
+# Do not modify by hand.
+
+# Imports
+from dataclasses import dataclass
+
+
+# Classes
+@dataclass
+class BS:
+    def record_identity(self):
+        """With the constant value ‘BS’."""
+        return slice(0, 2)
+
+    def transaction_type(self):
+        """‘N’ = New. ‘D’ = Delete. ‘R’ = Revise."""
+        return slice(2, 3)
+
+    def train_uid(self):
+        """Unique train Identifier."""
+        return slice(3, 9)
+
+    def date_runs_from(self):
+        """yymmdd"""
+        return slice(9, 15)
+
+    def date_runs_to(self):
+        """yymmdd"""
+        return slice(15, 21)
+
+    def days_run(self):
+        """No description."""
+        return slice(21, 28)
+
+    def bank_holiday_running(self):
+        """No description."""
+        return slice(28, 29)
+
+    def train_status(self):
+        """No description."""
+        return slice(29, 30)
+
+    def train_category(self):
+        """No description."""
+        return slice(30, 32)
+
+    def train_identity(self):
+        """No description."""
+        return slice(32, 36)
+
+    def headcode(self):
+        """No description."""
+        return slice(36, 40)
+
+    def course_indicator(self):
+        """Not used - always set to 1."""
+        return slice(40, 41)
+
+    def profit_centre_code(self):
+        """No description."""
+        return slice(41, 49)
+
+    def business_sector(self):
+        """Now used to contain the portion suffix for RSID"""
+        return slice(49, 50)
+
+    def power_type(self):
+        """No description."""
+        return slice(50, 53)
+
+    def timing_load(self):
+        """No description."""
+        return slice(53, 57)
+
+    def speed(self):
+        """No description."""
+        return slice(57, 60)
+
+    def operating_chars(self):
+        """No description."""
+        return slice(60, 66)
+
+    def train_class(self):
+        """No description."""
+        return slice(66, 67)
+
+    def sleepers(self):
+        """No description."""
+        return slice(67, 68)
+
+    def reservations(self):
+        """Permitted values are: A – Seat Reservations Compulsory (R symbol in white box) E – Reservations for Bicycles Essential (Inverted black triangle) R – Seat Reservations Recommended (R symbol in black box) S – Seat Reservations possible from any station (white diamond symbol)"""
+        return slice(68, 69)
+
+    def connect_indicator(self):
+        """Not used - always set to blank."""
+        return slice(69, 70)
+
+    def catering_code(self):
+        """No description."""
+        return slice(70, 74)
+
+    def service_branding(self):
+        """No description."""
+        return slice(74, 78)
+
+    def spare(self):
+        """No description."""
+        return slice(78, 79)
+
+    def stp_indicator(self):
+        """‘C’ = STP cancellation of permanent schedule. ‘N’ = New STP schedule. ‘O’ = STP overlay of permanent schedule. ‘P’ = Permanent. Read in association with the Transaction Type in Field 2"""
+        return slice(79, 80)
+
+
+@dataclass
+class HD:
+    def record_identity(self):
+        """With the constant value ‘HD’."""
+        return slice(0, 2)
+
+    def file_identity(self):
+        """No description."""
+        return slice(2, 22)
+
+    def date_of_extract(self):
+        """Format ddmmyy defining the date that the BTD extract file was created."""
+        return slice(22, 28)
+
+    def time_of_extract(self):
+        """hhmm defining the time that the BTD extract file was created."""
+        return slice(28, 32)
+
+    def current_file_reference(self):
+        """Unique file reference."""
+        return slice(32, 39)
+
+    def last_file_reference(self):
+        """Unique file reference."""
+        return slice(39, 46)
+
+    def update_indicator(self):
+        """‘U’=Update. ‘F’=Full extract."""
+        return slice(46, 47)
+
+    def version(self):
+        """Version identifier of CIF software."""
+        return slice(47, 48)
+
+    def extract_start_date(self):
+        """Same as Field 3 above."""
+        return slice(48, 54)
+
+    def extract_end_date(self):
+        """No description."""
+        return slice(54, 60)
+
+    def spare(self):
+        """No description."""
+        return slice(60, 80)
+
+
+@dataclass
+class ZZ:
+    def record_identity(self):
+        """With the constant value ‘ZZ’."""
+        return slice(0, 2)
+
+    def spare(self):
+        """No description."""
+        return slice(2, 80)
+
+
+@dataclass
+class TA:
+    def record_identity(self):
+        """With the constant value ‘TA’."""
+        return slice(0, 2)
+
+    def tiploc_code(self):
+        """A TIPLOC is 4-7 characters. If less than 7 then it will be padded by blanks."""
+        return slice(2, 9)
+
+    def capitals(self):
+        """Defines capitalisation of TIPLOC. Can be ignored for retailing/journey planners."""
+        return slice(9, 11)
+
+    def national_location_code(self):
+        """No description."""
+        return slice(11, 17)
+
+    def nlc_check_character(self):
+        """No description."""
+        return slice(17, 18)
+
+    def tps_description(self):
+        """No description."""
+        return slice(18, 44)
+
+    def stanox(self):
+        """TOPS location code."""
+        return slice(44, 49)
+
+    def po_mcp_code(self):
+        """Post Office Location Code. (Not used but may contain historic data or three blank spaces followed by 0)."""
+        return slice(49, 53)
+
+    def crs_code(self):
+        """No description."""
+        return slice(53, 56)
+
+    def description(self):
+        """Description used in LENNON."""
+        return slice(56, 72)
+
+    def new_tiploc(self):
+        """Only present if TIPLOC change."""
+        return slice(72, 79)
+
+    def spare(self):
+        """No description."""
+        return slice(79, 80)
+
+
+@dataclass
+class CR:
+    def record_identity(self):
+        """With the constant value ‘CR’."""
+        return slice(0, 2)
+
+    def location(self):
+        """TIPLOC + Suffix. Suffix is always the eighth character."""
+        return slice(2, 10)
+
+    def train_category(self):
+        """No description."""
+        return slice(10, 12)
+
+    def train_identity(self):
+        """No description."""
+        return slice(12, 16)
+
+    def headcode(self):
+        """No description."""
+        return slice(16, 20)
+
+    def course_indicator(self):
+        """No description."""
+        return slice(20, 21)
+
+    def profit_centre_code(self):
+        """No description."""
+        return slice(21, 29)
+
+    def business_sector(self):
+        """No description."""
+        return slice(29, 30)
+
+    def power_type(self):
+        """No description."""
+        return slice(30, 33)
+
+    def timing_load(self):
+        """No description."""
+        return slice(33, 37)
+
+    def speed(self):
+        """No description."""
+        return slice(37, 40)
+
+    def operating_chars(self):
+        """No description."""
+        return slice(40, 46)
+
+    def train_class(self):
+        """No description."""
+        return slice(46, 47)
+
+    def sleepers(self):
+        """No description."""
+        return slice(47, 48)
+
+    def reservations(self):
+        """No description."""
+        return slice(48, 49)
+
+    def connect_indicator(self):
+        """No description."""
+        return slice(49, 50)
+
+    def catering_code(self):
+        """No description."""
+        return slice(50, 54)
+
+    def service_branding(self):
+        """No description."""
+        return slice(54, 58)
+
+    def traction_class(self):
+        """No description."""
+        return slice(58, 62)
+
+    def uic_code(self):
+        """Only populated for trains travelling to/from Europe via the Channel Tunnel, otherwise blank."""
+        return slice(62, 67)
+
+    def retail_service_id(self):
+        """No description."""
+        return slice(67, 75)
+
+    def spare(self):
+        """No description."""
+        return slice(75, 80)
+
+
+@dataclass
+class LT:
+    def record_identity(self):
+        """With the constant value ‘LT’."""
+        return slice(0, 2)
+
+    def location(self):
+        """TIPLOC +Suffix. Suffix is always the eighth character."""
+        return slice(2, 10)
+
+    def scheduled_arrival_time(self):
+        """No description."""
+        return slice(10, 15)
+
+    def public_arrival_time(self):
+        """If there is no Public Arrival time this field will default to 0000."""
+        return slice(15, 19)
+
+    def platform(self):
+        """No description."""
+        return slice(19, 22)
+
+    def path(self):
+        """No description."""
+        return slice(22, 25)
+
+    def activity(self):
+        """Up to 6 activity codes may be present. The first 2 characters will always be TF (train finishes). If there are no other activity codes, this defaults to being an advertised arrival."""
+        return slice(25, 37)
+
+    def spare(self):
+        """No description."""
+        return slice(37, 80)
+
+
+@dataclass
+class LI:
+    def record_identity(self):
+        """With the constant value ‘LI’."""
+        return slice(0, 2)
+
+    def location(self):
+        """TIPLOC + Suffix. Suffix is always the eighth character."""
+        return slice(2, 10)
+
+    def scheduled_arrival_time(self):
+        """No description."""
+        return slice(10, 15)
+
+    def scheduled_departure_time(self):
+        """No description."""
+        return slice(15, 20)
+
+    def scheduled_pass(self):
+        """No description."""
+        return slice(20, 25)
+
+    def public_arrival(self):
+        """If there is no Public Arrival time this field will default to 0000."""
+        return slice(25, 29)
+
+    def public_departure(self):
+        """If there is no Public Departure time this field will default to 0000."""
+        return slice(29, 33)
+
+    def platform(self):
+        """No description."""
+        return slice(33, 36)
+
+    def line(self):
+        """No description."""
+        return slice(36, 39)
+
+    def path(self):
+        """No description."""
+        return slice(39, 42)
+
+    def activity(self):
+        """Up to 6 activity codes may be present."""
+        return slice(42, 54)
+
+    def engineering_allowance(self):
+        """No description."""
+        return slice(54, 56)
+
+    def pathing_allowance(self):
+        """No description."""
+        return slice(56, 58)
+
+    def performance_allowance(self):
+        """No description."""
+        return slice(58, 60)
+
+    def spare(self):
+        """No description."""
+        return slice(60, 80)
+
+
+@dataclass
+class TD:
+    def record_identity(self):
+        """With the constant value ‘TD’."""
+        return slice(0, 2)
+
+    def tiploc_code(self):
+        """No description."""
+        return slice(2, 9)
+
+    def spare(self):
+        """No description."""
+        return slice(9, 80)
+
+
+@dataclass
+class AA:
+    def record_identity(self):
+        """With the constant value ‘AA’."""
+        return slice(0, 2)
+
+    def transaction_type(self):
+        """‘N’ = New. ‘D’ = Delete. ‘R’ = Revise."""
+        return slice(2, 3)
+
+    def base_uid(self):
+        """One of the trains involved in the association. This will always be the through train, not the splitting/joining portion."""
+        return slice(3, 9)
+
+    def assoc_uid(self):
+        """The other train involved."""
+        return slice(9, 15)
+
+    def assoc_start_date(self):
+        """Format: yymmdd. May not be the same as the dates of the train schedules."""
+        return slice(15, 21)
+
+    def assoc_end_date(self):
+        """Format: yymmdd. May not be the same as the dates of the train schedules."""
+        return slice(21, 27)
+
+    def assoc_days(self):
+        """No description."""
+        return slice(27, 34)
+
+    def assoc_cat(self):
+        """The ASSOC-CAT for the base UID (first byte), followed by the ASSOC-CAT for the assoc. UID (second byte). Note: Although this field isn’t specified as having blanks in the Network Rail CIF specification, if blanks are supplied they will be carried forward. (Blanks are used to override the permanent value in overlays and cancellations). ‘JJ’ for Joining trains and ‘VV’ for Dividing trains. ‘NP’ for Next/Previous Associations may also be displayed but as this is an Operating association it should be ignored by journey planners."""
+        return slice(34, 36)
+
+    def assoc_date_ind(self):
+        """‘S’ = Standard. ‘N’ = Over-next-midnight. ‘P’ = Over-previous-midnight. Note: Although this field isn’t specified as having blanks in the Network Rail CIF specification, if blanks are supplied they will be carried forward. (Blanks are used to override the permanent value in overlays and cancellations)."""
+        return slice(36, 37)
+
+    def assoc_location(self):
+        """TIPLOC where association occurs."""
+        return slice(37, 44)
+
+    def base_location_suffix(self):
+        """Values are space or 2."""
+        return slice(44, 45)
+
+    def assoc_location_suffix(self):
+        """Values are space or 2."""
+        return slice(45, 46)
+
+    def diagram_type(self):
+        """With the constant value ‘T’."""
+        return slice(46, 47)
+
+    def association_type(self):
+        """‘P’ = Passenger use. ‘O’ = Operating use. Note: Although this field isn’t specified as having blanks in the Network Rail CIF specification, if blanks are supplied they will be carried forward. (If blank then association defaults to Operating and should be ignored by journey planners)."""
+        return slice(47, 48)
+
+    def filler(self):
+        """No description."""
+        return slice(48, 79)
+
+    def stp_indicator(self):
+        """Read in conjunction with the ‘Transaction Type’ in Field 2. ‘C’ = STP cancellation of permanent schedule. ‘N’ = New STP schedule. ‘O’ = STP overlay of permanent schedule. ‘P’ = Permanent."""
+        return slice(79, 80)
+
+
+@dataclass
+class LO:
+    def record_identity(self):
+        """With the constant value ‘LO’."""
+        return slice(0, 2)
+
+    def location(self):
+        """TIPLOC + Suffix. Suffix is always the eighth character."""
+        return slice(2, 10)
+
+    def scheduled_departure_time(self):
+        """No description."""
+        return slice(10, 15)
+
+    def public_departure_time(self):
+        """If there is no Public Departure time this field will default to 0000."""
+        return slice(15, 19)
+
+    def platform(self):
+        """No description."""
+        return slice(19, 22)
+
+    def line(self):
+        """No description."""
+        return slice(22, 25)
+
+    def engineering_allowance(self):
+        """No description."""
+        return slice(25, 27)
+
+    def pathing_allowance(self):
+        """No description."""
+        return slice(27, 29)
+
+    def activity(self):
+        """Up to 6 activity codes may be present. The first 2 characters will always be TB (train begins). If there are no other activity codes, this defaults to being an advertised departure."""
+        return slice(29, 41)
+
+    def performance_allowance(self):
+        """No description."""
+        return slice(41, 43)
+
+    def spare(self):
+        """No description."""
+        return slice(43, 80)
+
+
+@dataclass
+class BX:
+    def record_identity(self):
+        """With the constant value ‘BX’."""
+        return slice(0, 2)
+
+    def traction_class(self):
+        """Not used – always blank."""
+        return slice(2, 6)
+
+    def uic_code(self):
+        """Only populated for trains travelling to/from Europe via the Channel Tunnel, otherwise blank."""
+        return slice(6, 11)
+
+    def atoc_code(self):
+        """No description."""
+        return slice(11, 13)
+
+    def applicable_timetable_code(self):
+        """Always set to ‘Y’."""
+        return slice(13, 14)
+
+    def retail_service_id(self):
+        """No description."""
+        return slice(14, 22)
+
+    def source(self):
+        """Not used – always blank."""
+        return slice(22, 23)
+
+    def spare(self):
+        """No description."""
+        return slice(23, 80)
+
+
+@dataclass
+class TI:
+    def record_identity(self):
+        """With the constant value ‘TI’."""
+        return slice(0, 2)
+
+    def tiploc_code(self):
+        """A TIPLOC is 4-7 characters. If less than 7 then it will be padded by blanks."""
+        return slice(2, 9)
+
+    def capitals(self):
+        """Defines capitalisation of TIPLOC. Can be ignored for retailing/journey planners."""
+        return slice(9, 11)
+
+    def national_location_code(self):
+        """No description."""
+        return slice(11, 17)
+
+    def nlc_check_character(self):
+        """No description."""
+        return slice(17, 18)
+
+    def tps_description(self):
+        """No description."""
+        return slice(18, 44)
+
+    def stanox(self):
+        """TOPS location code."""
+        return slice(44, 49)
+
+    def po_mcp_code(self):
+        """Post Office Location Code. (Not used but may contain historic data or three blank spaces followed by 0)."""
+        return slice(49, 53)
+
+    def crs_code(self):
+        """No description."""
+        return slice(53, 56)
+
+    def description(self):
+        """Description used in LENNON."""
+        return slice(56, 72)
+
+    def spare(self):
+        """No description."""
+        return slice(72, 80)
@@ -65,7 +65,7 @@ def fetch_nr_token(
 def fetch_nr_timetable_files(
    config: NRConfig | None = None,  # pyright: ignore[reportRedeclaration]
    token: str | None = None,  # pyright: ignore[reportRedeclaration]
-    attempts: int = 1,
+    attempts: int = 3,
 ) -> ZipFile:
    config: NRConfig = config if config is not None else NRConfig.from_env()
    token: str = (
@@ -11,10 +11,9 @@ Aimed primarily towards producing a reduced sqlite database.
 # Imports
 from itertools import pairwise
 from pathlib import Path
-import pandas as pd
+
 import numpy as np
-import sqlite3
-import os
+import pandas as pd
 from pypdf import PageObject, PdfReader


@@ -149,67 +148,50 @@ def read_specification_table_raws(
    return tables


-def create_mca_specification_dbschema(
-    tables: dict[str, pd.DataFrame],
-    db_path: Path | None = None,
+def generate_mca_stubs(
+    data_dir: Path | None = None,
 ):
-    db_path = (
-        db_path
-        if db_path is not None
-        else Path(os.environ.get("NR_DATADIR", "~/.cache/nr_data/timetable.db"))
-    )
-    db_path.parent.mkdir(exist_ok=True, parents=True)
-    connection = sqlite3.connect(db_path)
-    cursor = connection.cursor()
-    for name, df in tables.items():
-        if (_n := name.split("_"))[0] != "MCA" or len(_n) != 2:
+    """
+    Write ``mca_stubs.py`` next to this module from the specification tables.
+
+    Every table returned by ``read_specification_table_raws(data_dir)`` whose
+    name starts with ``MCA_`` becomes a class named after the part following
+    the prefix. Each table row becomes a method that returns the zero-based,
+    end-exclusive ``slice`` for the row's ``Position`` and carries the row's
+    ``Notes`` as its docstring.
+
+    Returns the number of characters written.
+    """
+    text = [
+        "# This file is generated by parsing.generate_mca_stubs.",
+        "# Do not modify by hand.",
+        "",
+        "# Imports",
+        "from dataclasses import dataclass",
+        "",
+        "# Classes",
+    ]
+    for name, df in read_specification_table_raws(data_dir).items():
+        if name[:4] != "MCA_":
             continue
-        df["Start Index"] = df["Position"].apply(lambda s: int(s.split("-")[0]) - 1)
-        df["End Index"] = df["Position"].apply(lambda s: int(s.split("-")[-1]))
-
-        _ = cursor.execute(f"DROP TABLE IF EXISTS spec_{name.lower()}")
-        _ = cursor.execute(
-            f"""
-                CREATE TABLE spec_{name.lower()}
-                ({", ".join([col.lower().replace(" ", "_") for col in df.columns])})
-            """,
-        )
-        _ = cursor.executemany(
-            f"""
-                INSERT INTO spec_{name.lower()}
-                VALUES({", ".join(["?" for _ in df.columns])})
-            """,
-            [list(row.values) for _, row in df.iterrows()],
-        )
-        connection.commit()
-    connection.close()
-    return db_path
+        text += [
+            "@dataclass",
+            f"class {name.split('_')[1]}:",
+        ]
+        for _, row in df.iterrows():
+            func_name = (
+                row["Field Description"]
+                .lower()
+                .replace(" ", "_")
+                .replace("-", "_")
+                .split("/")[0]
+                .strip()
+            )
+            # "Position" is a 1-based inclusive range such as "3-9".
+            start = int(row["Position"].split("-")[0]) - 1
+            stop = int(row["Position"].split("-")[-1])
+            text += [
+                "",
+                f"    def {func_name}(self):",
+                f'        """{t if (t := str(row["Notes"])) != "nan" else "No description."}"""',
+                f"        return slice({start}, {stop})",
+            ]
+        text += ["", ""]
+    # The notes contain non-ASCII punctuation and Python reads source files as
+    # UTF-8, so the encoding must not be left to the platform default.
+    with open(Path(__file__).parent / "mca_stubs.py", "w", encoding="utf-8") as wf:
+        return wf.write("\n".join(text))


 # Script
-def main(
-    skip_pdf: bool = False,
-    pdf_spec_path: Path | None = None,
-    raw_spec_dir: Path | None = None,
-):
-    if not skip_pdf:
-        try:
-            tables = extract_specification_document_tables(pdf_spec_path)
-            _ = store_specification_table_raws(tables, raw_spec_dir)
-        except FileNotFoundError:
-            pass
-
-    try:
-        tables = read_specification_table_raws(raw_spec_dir)
-    except FileNotFoundError:
-        raise FileNotFoundError(
-            "The tables generated from the RSP's specification were not found. "
-            + "This means neither the cached version nor the original .pdf is available. "
-            + "Try suppling either to their default locations, or supplying custom directories. "
-            + "Manual fix: extract_specification_document_tables then store_specification_table_raws. "
-        )
-
-    _ = create_mca_specification_dbschema(tables)
+def main():
+    """Regenerate the stubs and print the value returned by generate_mca_stubs()."""
+    print(generate_mca_stubs())


 if __name__ == "__main__":
Author	SHA1	Message	Date
sam	cd81b7514a	Tracked mca_querying.py.	2026-05-26 17:33:25 +01:00
sam	c7bb1608a1	Progress towards sensible-enough numpy based querying using shared memory views rather than duplicated data.	2026-05-26 17:32:56 +01:00
sam	b9ffbdee89	Created mca_stubs.py which provides record entry details (and slice objects) for the .MCA types.	2026-05-26 11:06:28 +01:00
sam	e887cc791e	Removal of sqlite/sqlalchemy based approach - it is too slow to combine the results, even with in-memory database loading.	2026-05-26 10:17:28 +01:00
sam	0479f1e4a8	Improved a few things, querying for multiple services now runs at a tolerable speed. Would prefer if it could be improved further, will look at pre-merging tables using sql rather than pandas.	2026-05-25 21:21:53 +01:00
sam	e723109a0a	Began creating some utility functions and noticed some limitations. Fetching one schedule at a time is too slow, and we could easily split an aggregated result.	2026-05-25 17:46:13 +01:00
sam	36aa23f464	Various minor updates, basic Schedule class. Added a SixData class to manage conversions of YYMMDD to/from more pythonic objects.	2026-05-25 14:06:35 +01:00
sam	c2633952d3	Added mca_queries.py and its pre-generated result mca_record_types.py. The latter is for type hinting and will make writing queries to solve for schedule numbers much easier. Next will be to write tools to make hunting for desired schedules easier.	2026-05-25 13:26:11 +01:00
sam	51c4f5030c	Updated the raw_mca_... table generation to include line number from the file, and schedule number - although we may need to investigate how the last entry behaves with 'ZZ' records and any others. We don't want to inherit the technical debt of remembering this one case every time.	2026-05-23 10:38:03 +01:00
sam	f35cda6f10	Finished parsing.py initial implementation, now have a sqlite database generating >600MB of timetable records. Next will be generating sqlalchemy descriptors based on the automated specifications. If I can re-learn sqlalchemy that is.	2026-05-22 16:57:14 +01:00