Finished parsing.py initial implementation, now have a sqlite database generating >600MB of timetable records. Next will be generating sqlalchemy desriptors based on the automated specifications. If I can re-learn sqlalchemy that is.
This commit is contained in:
@@ -1,10 +1,3 @@
|
|||||||
from national_rail_timetable.nr_requests import fetch_nr_token, fetch_nr_timetable_files
|
from national_rail_timetable.parsing import main
|
||||||
from national_rail_timetable.parsing import (
|
|
||||||
extract_specification_document_tables,
|
|
||||||
store_specification_table_raws,
|
|
||||||
read_specification_table_raws,
|
|
||||||
create_mca_specification_dbschema,
|
|
||||||
main,
|
|
||||||
)
|
|
||||||
|
|
||||||
print(main())
|
print(main())
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ def fetch_nr_token(
|
|||||||
def fetch_nr_timetable_files(
|
def fetch_nr_timetable_files(
|
||||||
config: NRConfig | None = None, # pyright: ignore[reportRedeclaration]
|
config: NRConfig | None = None, # pyright: ignore[reportRedeclaration]
|
||||||
token: str | None = None, # pyright: ignore[reportRedeclaration]
|
token: str | None = None, # pyright: ignore[reportRedeclaration]
|
||||||
attempts: int = 1,
|
attempts: int = 3,
|
||||||
) -> ZipFile:
|
) -> ZipFile:
|
||||||
config: NRConfig = config if config is not None else NRConfig.from_env()
|
config: NRConfig = config if config is not None else NRConfig.from_env()
|
||||||
token: str = (
|
token: str = (
|
||||||
|
|||||||
@@ -9,14 +9,17 @@ Aimed primarily towards producing a reduced sqlite database.
|
|||||||
# pyright: reportUnknownLambdaType=false
|
# pyright: reportUnknownLambdaType=false
|
||||||
|
|
||||||
# Imports
|
# Imports
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
from itertools import pairwise
|
from itertools import pairwise
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import pandas as pd
|
from zipfile import ZipFile
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import sqlite3
|
import pandas as pd
|
||||||
import os
|
|
||||||
from pypdf import PageObject, PdfReader
|
from pypdf import PageObject, PdfReader
|
||||||
|
|
||||||
|
from national_rail_timetable.nr_requests import fetch_nr_timetable_files
|
||||||
|
|
||||||
# Init.
|
# Init.
|
||||||
SPECIFICATION_TABLE_LOCATIONS = {
|
SPECIFICATION_TABLE_LOCATIONS = {
|
||||||
@@ -34,6 +37,7 @@ SPECIFICATION_TABLE_LOCATIONS = {
|
|||||||
"MCA_ZZ": (23, 1),
|
"MCA_ZZ": (23, 1),
|
||||||
}
|
}
|
||||||
DEFAULT_RAW_SPEC_DATA_DIR = Path(__file__).parents[2] / "data/specification_tables"
|
DEFAULT_RAW_SPEC_DATA_DIR = Path(__file__).parents[2] / "data/specification_tables"
|
||||||
|
DEFAULT_DB_PATH = Path.home() / ".cache/nr_data/timetable.db"
|
||||||
|
|
||||||
|
|
||||||
# Functions
|
# Functions
|
||||||
@@ -149,16 +153,21 @@ def read_specification_table_raws(
|
|||||||
return tables
|
return tables
|
||||||
|
|
||||||
|
|
||||||
def create_mca_specification_dbschema(
|
def _validate_db_path(db_path: Path | None = None):
|
||||||
tables: dict[str, pd.DataFrame],
|
|
||||||
db_path: Path | None = None,
|
|
||||||
):
|
|
||||||
db_path = (
|
db_path = (
|
||||||
db_path
|
db_path
|
||||||
if db_path is not None
|
if db_path is not None
|
||||||
else Path(os.environ.get("NR_DATADIR", "~/.cache/nr_data/timetable.db"))
|
else Path(os.environ.get("NR_DATADIR", DEFAULT_DB_PATH))
|
||||||
)
|
)
|
||||||
db_path.parent.mkdir(exist_ok=True, parents=True)
|
db_path.parent.mkdir(exist_ok=True, parents=True)
|
||||||
|
return db_path
|
||||||
|
|
||||||
|
|
||||||
|
def create_mca_specification_dbtables(
|
||||||
|
tables: dict[str, pd.DataFrame],
|
||||||
|
db_path: Path | None = None,
|
||||||
|
):
|
||||||
|
db_path = _validate_db_path(db_path)
|
||||||
connection = sqlite3.connect(db_path)
|
connection = sqlite3.connect(db_path)
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
for name, df in tables.items():
|
for name, df in tables.items():
|
||||||
@@ -179,13 +188,98 @@ def create_mca_specification_dbschema(
|
|||||||
INSERT INTO spec_{name.lower()}
|
INSERT INTO spec_{name.lower()}
|
||||||
VALUES({", ".join(["?" for _ in df.columns])})
|
VALUES({", ".join(["?" for _ in df.columns])})
|
||||||
""",
|
""",
|
||||||
[list(row.values) for _, row in df.iterrows()],
|
[tuple(row.values) for _, row in df.iterrows()],
|
||||||
)
|
)
|
||||||
connection.commit()
|
connection.commit()
|
||||||
connection.close()
|
connection.close()
|
||||||
return db_path
|
return db_path
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: There is no need for this to take minutes, investigate speed ups.
|
||||||
|
def create_mca_raw_dbtables(
|
||||||
|
zipfile: ZipFile | None = None,
|
||||||
|
db_path: Path | None = None,
|
||||||
|
allow_fetch: bool = True,
|
||||||
|
) -> dict[str, str]:
|
||||||
|
db_path = _validate_db_path(db_path)
|
||||||
|
if zipfile is None:
|
||||||
|
if allow_fetch:
|
||||||
|
zipfile = fetch_nr_timetable_files()
|
||||||
|
else:
|
||||||
|
raise RuntimeError(
|
||||||
|
"There was no zipfile provided and allow_fetch is set to False. "
|
||||||
|
+ "Please either allow automatic fetching, or supply the zipfile argument. "
|
||||||
|
+ "The package's fetching function is fetch_nr_timetable_files. "
|
||||||
|
)
|
||||||
|
|
||||||
|
connection = sqlite3.connect(db_path)
|
||||||
|
cursor = connection.cursor()
|
||||||
|
tables = [
|
||||||
|
row[0]
|
||||||
|
for row in cursor.execute(
|
||||||
|
"SELECT name FROM sqlite_master WHERE type = 'table'"
|
||||||
|
).fetchall()
|
||||||
|
if row[0][:9] == "spec_mca_"
|
||||||
|
]
|
||||||
|
if len(tables) == 0:
|
||||||
|
raise FileNotFoundError(
|
||||||
|
"No spec_mca_... tables found in given database. "
|
||||||
|
+ "Please ensure create_mca_specification_dbtables has been run successfully. "
|
||||||
|
)
|
||||||
|
mappings = {}
|
||||||
|
all_start_indexes = {}
|
||||||
|
all_end_indexes = {}
|
||||||
|
for name in tables:
|
||||||
|
mappings[name] = "raw_" + name[5:]
|
||||||
|
spec = pd.DataFrame(
|
||||||
|
cursor.execute(f"SELECT * FROM {name}").fetchall(),
|
||||||
|
columns=[d[0] for d in cursor.description],
|
||||||
|
)
|
||||||
|
all_start_indexes[mappings[name]] = spec.start_index.values
|
||||||
|
all_end_indexes[mappings[name]] = spec.end_index.values
|
||||||
|
new_columns = [
|
||||||
|
col.split("/")[0].lower().replace(" ", "_").replace("-", "_")
|
||||||
|
for col in spec.field_description
|
||||||
|
]
|
||||||
|
_ = cursor.execute(f"DROP TABLE IF EXISTS {mappings[name].lower()}")
|
||||||
|
_ = cursor.execute(
|
||||||
|
f"""
|
||||||
|
CREATE TABLE {mappings[name]}
|
||||||
|
({(", ".join(new_columns))})
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
connection.commit()
|
||||||
|
|
||||||
|
zipinfo = [
|
||||||
|
zipinfo
|
||||||
|
for name, zipinfo in zipfile.NameToInfo.items()
|
||||||
|
if name.split(".")[-1] == "MCA"
|
||||||
|
][0]
|
||||||
|
file = zipfile.open(zipinfo)
|
||||||
|
counter = 0
|
||||||
|
while (line := file.readline().decode()) != "":
|
||||||
|
record_type = line[:2]
|
||||||
|
target_table = f"raw_mca_{record_type.lower()}"
|
||||||
|
start_indexes = all_start_indexes.get(target_table)
|
||||||
|
end_indexes = all_end_indexes.get(target_table)
|
||||||
|
if start_indexes is None or end_indexes is None:
|
||||||
|
continue
|
||||||
|
values = ", ".join(
|
||||||
|
[
|
||||||
|
"'" + line[lb:ub].replace("'", "") + "'"
|
||||||
|
for lb, ub in zip(start_indexes, end_indexes, strict=True)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
_ = cursor.execute(f"INSERT INTO {target_table} VALUES({values})")
|
||||||
|
counter += 1
|
||||||
|
if counter % 1111 == 0:
|
||||||
|
print(f" {counter:,}", end="\r")
|
||||||
|
print()
|
||||||
|
connection.commit()
|
||||||
|
connection.close()
|
||||||
|
return mappings
|
||||||
|
|
||||||
|
|
||||||
# Script
|
# Script
|
||||||
def main(
|
def main(
|
||||||
skip_pdf: bool = False,
|
skip_pdf: bool = False,
|
||||||
@@ -209,8 +303,9 @@ def main(
|
|||||||
+ "Manual fix: extract_specification_document_tables then store_specification_table_raws. "
|
+ "Manual fix: extract_specification_document_tables then store_specification_table_raws. "
|
||||||
)
|
)
|
||||||
|
|
||||||
_ = create_mca_specification_dbschema(tables)
|
_ = create_mca_specification_dbtables(tables)
|
||||||
|
return create_mca_raw_dbtables()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
_ = main()
|
||||||
|
|||||||
Reference in New Issue
Block a user