From d63f151c9b3c9ca3fbe301e25820f84d3506a05a Mon Sep 17 00:00:00 2001
From: Samuel Jones <samuel@williamjones.me>
Date: Fri, 22 May 2026 11:59:45 +0100
Subject: [PATCH] Added sqlite export of .MCA file's record spec. This won't
 live in /data, but in a user's cache. This is to allow user choice on how and
 when to update the timetable files and reduce redundancy.

---
 src/national_rail_timetable/__main__.py |  8 ++-
 src/national_rail_timetable/parsing.py  | 70 +++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 5 deletions(-)

diff --git a/src/national_rail_timetable/__main__.py b/src/national_rail_timetable/__main__.py
index b6f6d71..0b79a3f 100644
--- a/src/national_rail_timetable/__main__.py
+++ b/src/national_rail_timetable/__main__.py
@@ -3,10 +3,8 @@ from national_rail_timetable.parsing import (
     extract_specification_document_tables,
     store_specification_table_raws,
     read_specification_table_raws,
+    create_mca_specification_dbschema,
+    main,
 )
 
-# print(fetch_nr_token())
-# print(fetch_nr_timetable_files())
-tables = extract_specification_document_tables()
-print(store_specification_table_raws(tables))
-print(read_specification_table_raws())
+print(main())  # run the spec -> sqlite pipeline and print whatever main() returns
diff --git a/src/national_rail_timetable/parsing.py b/src/national_rail_timetable/parsing.py
index d732fee..f7007a3 100644
--- a/src/national_rail_timetable/parsing.py
+++ b/src/national_rail_timetable/parsing.py
@@ -6,12 +6,15 @@ Aimed primarily towards producing a reduced sqlite database.
 # pyright: reportUnknownVariableType=false
 # pyright: reportUnknownArgumentType=false
 # pyright: reportUnknownMemberType=false
+# pyright: reportUnknownLambdaType=false
 
 # Imports
 from itertools import pairwise
 from pathlib import Path
 import pandas as pd
 import numpy as np
+import sqlite3
+import os
 from pypdf import PageObject, PdfReader
 
 
@@ -144,3 +147,70 @@ def read_specification_table_raws(
             continue
         tables[path.name[:-4]] = pd.read_csv(path)
     return tables
+
+
+def create_mca_specification_dbschema(
+    tables: dict[str, pd.DataFrame],
+    db_path: Path | None = None,
+) -> Path:
+    """Export every two-part ``MCA_*`` table to sqlite as ``spec_<name>``; return the db path.
+
+    `db_path` falls back to ``$NR_DATADIR`` (used as the file path), then ``~/.cache/nr_data/timetable.db``.
+    """
+    if db_path is None:
+        db_path = Path(os.environ.get("NR_DATADIR", "~/.cache/nr_data/timetable.db"))
+    db_path = db_path.expanduser()  # pathlib never expands "~" on its own
+    db_path.parent.mkdir(exist_ok=True, parents=True)
+    connection = sqlite3.connect(db_path)
+    try:
+        for name, df in tables.items():
+            if (_n := name.split("_"))[0] != "MCA" or len(_n) != 2:
+                continue
+            # "Position" is presumably a 1-based inclusive "start-end" range; store it as
+            # 0-based slice bounds, on a copy so the caller's frame is not mutated.
+            df = df.copy()
+            df["Start Index"] = df["Position"].apply(lambda s: int(str(s).split("-")[0]) - 1)
+            df["End Index"] = df["Position"].apply(lambda s: int(str(s).split("-")[-1]))
+            table = '"spec_{}"'.format(name.lower().replace('"', '""'))
+            names = [c.lower().replace(" ", "_").replace('"', '""') for c in df.columns]
+            columns = ", ".join(f'"{n}"' for n in names)  # quoted: headers may be keywords or hold punctuation
+            with connection:  # commits on success, rolls back pending inserts on error
+                _ = connection.execute(f"DROP TABLE IF EXISTS {table}")
+                _ = connection.execute(f"CREATE TABLE {table} ({columns})")
+                _ = connection.executemany(
+                    f"INSERT INTO {table} VALUES({', '.join('?' * len(df.columns))})",
+                    [list(row.values) for _, row in df.iterrows()],
+                )
+    finally:
+        connection.close()  # release the database file even when an export fails
+    return db_path
+
+
+# Script
+def main(
+    skip_pdf: bool = False,
+    pdf_spec_path: Path | None = None,
+    raw_spec_dir: Path | None = None,
+) -> Path:
+    """Export the RSP specification tables to sqlite and return the database path."""
+    if not skip_pdf:
+        try:
+            tables = extract_specification_document_tables(pdf_spec_path)
+            _ = store_specification_table_raws(tables, raw_spec_dir)
+        except FileNotFoundError:
+            pass  # best effort: e.g. no PDF available, so fall back to the cached tables
+    # Failing to read the cached tables at this point is fatal: re-raise with guidance.
+    try:
+        tables = read_specification_table_raws(raw_spec_dir)
+    except FileNotFoundError as err:
+        raise FileNotFoundError(
+            "The tables generated from the RSP's specification were not found. "
+            + "This means neither the cached version nor the original .pdf is available. "
+            + "Try supplying either to their default locations, or supplying custom directories. "
+            + "Manual fix: extract_specification_document_tables then store_specification_table_raws."
+        ) from err
+    return create_mca_specification_dbschema(tables)
+
+
+if __name__ == "__main__":  # only when this module is executed directly, not on import
+    main()