Skip to content

spreadsheets

excel_to_sqlite

excel_to_sqlite(spreadsheet: str | upath.UPath, save_path: str | upath.UPath) -> upath.UPath

This code uses the openpyxl package for playing around with excel using Python code to convert complete excel workbook (all sheets) to an SQLite database The code assumes that the first row of every sheet is the column name Every sheet is stored in a separate table The sheet name is assigned as the table name for every sheet.

From https://stackoverflow.com/questions/17439885/export-data-from-excel-to-sqlite-database

Source code in npc_lims/metadata/spreadsheets.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def excel_to_sqlite(
    spreadsheet: str | upath.UPath,
    save_path: str | upath.UPath,
) -> upath.UPath:
    """
    This code uses the openpyxl package for playing around with excel using Python code
    to convert complete excel workbook (all sheets) to an SQLite database
    The code assumes that the first row of every sheet is the column name
    Every sheet is stored in a separate table
    The sheet name is assigned as the table name for every sheet.

    From
    https://stackoverflow.com/questions/17439885/export-data-from-excel-to-sqlite-database
    """
    spreadsheet = upath.UPath(spreadsheet)
    save_path = upath.UPath(save_path)

    db_path = tempfile.mkstemp(suffix=".sqlite")[1]
    xls_path = tempfile.mkstemp(suffix=spreadsheet.suffix)[1]
    upath.UPath(xls_path).write_bytes(spreadsheet.read_bytes())

    # Replace with a database name
    con = sqlite3.connect(db_path)

    # replace with the complete path to your excel workbook
    wb = openpyxl.load_workbook(filename=xls_path)

    def slugify(text: str, lower=1) -> str:
        if lower == 1:
            text = text.strip().lower()
        text = text.replace("d'", "dprime")
        text = re.sub(r"[^\w _-]+", "", text)
        text = re.sub(r"[- ]+", "_", text)
        return text

    for sheet in wb.sheetnames:
        ws = wb[sheet]
        columns = []
        duplicate_column_idx = []
        query = (
            "CREATE TABLE "
            + repr(str(slugify(sheet)))
            + "(ID INTEGER PRIMARY KEY AUTOINCREMENT"
        )
        for row in ws.rows:
            for idx, col in enumerate(row):
                column_name = slugify(col.value)
                if column_name not in columns:
                    query += ", " + column_name + " TEXT"
                    columns.append(column_name)
                else:
                    duplicate_column_idx.append(idx)
            break  # only want column names from first row
        query += ");"
        if not columns:
            continue

        con.execute(query)

        tup = []
        for i, col in enumerate(ws):
            tuprow = []
            if i == 0:
                continue
            for idx, col in enumerate(col):
                if idx in duplicate_column_idx:
                    continue
                tuprow.append(str(col.value).strip()) if str(
                    col.value
                ).strip() != "None" else tuprow.append("")
            tup.append(tuple(tuprow))

        insQuery1 = "INSERT INTO " + repr(str(slugify(sheet))) + "("
        insQuery2 = ""
        for col in columns:
            insQuery1 += col + ", "
            insQuery2 += "?, "
        insQuery1 = insQuery1[:-2] + ") VALUES("
        insQuery2 = insQuery2[:-2] + ")"
        insQuery = insQuery1 + insQuery2

        con.executemany(insQuery, tup)
        con.commit()

    con.close()
    save_path.write_bytes(upath.UPath(db_path).read_bytes())
    return save_path

get_training_db cached

get_training_db(nsb: bool = False) -> sqlite3.Connection

Download db to tempdir, open connection, return connection.

Examples:

>>> assert get_training_db()
Source code in npc_lims/metadata/spreadsheets.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
@functools.cache
def get_training_db(nsb: bool = False) -> sqlite3.Connection:
    """
    Download db to tempdir, open connection, return connection.

    Examples:
        >>> assert get_training_db()
    """
    db_path = upath.UPath(tempfile.mkstemp(suffix=".db")[1])
    s3_path = next(
        path for path in get_training_sqlite_paths() if ("NSB" in path.name) == nsb
    )
    db_path.write_bytes(s3_path.read_bytes())
    con = sqlite3.connect(db_path)

    def dict_factory(cursor, row):
        d = {}
        for idx, col in enumerate(cursor.description):
            d[col[0]] = row[idx]
        return d

    con.row_factory = dict_factory
    return con

get_training_spreadsheet_paths

get_training_spreadsheet_paths() -> tuple[upath.UPath, ...]

Examples:

>>> assert len(get_training_spreadsheet_paths()) > 0
Source code in npc_lims/metadata/spreadsheets.py
49
50
51
52
53
54
def get_training_spreadsheet_paths() -> tuple[upath.UPath, ...]:
    """
    Examples:
        >>> assert len(get_training_spreadsheet_paths()) > 0
    """
    return tuple(DR_DATA_REPO.parent.glob("DynamicRoutingTraining*.xlsx"))

get_training_sqlite_paths

get_training_sqlite_paths() -> tuple[upath.UPath, ...]

Examples:

>>> assert len(get_training_sqlite_paths()) == len(get_training_spreadsheet_paths())
Source code in npc_lims/metadata/spreadsheets.py
14
15
16
17
18
19
20
21
def get_training_sqlite_paths() -> tuple[upath.UPath, ...]:
    """
    Examples:
        >>> assert len(get_training_sqlite_paths()) == len(get_training_spreadsheet_paths())
    """
    return tuple(
        path.with_suffix(".sqlite") for path in get_training_spreadsheet_paths()
    )

update_training_dbs

update_training_dbs() -> None

Read spreadsheets from the data repo and write them to corresponding databases, currently sqlite files in the same directory.

Examples:

>>> update_training_dbs()
Source code in npc_lims/metadata/spreadsheets.py
57
58
59
60
61
62
63
64
65
66
67
68
def update_training_dbs() -> None:
    """
    Read spreadsheets from the data repo and write them to corresponding
    databases, currently sqlite files in the same directory.

    Examples:
        >>> update_training_dbs()
    """
    for spreadsheet, sqlite in zip(
        get_training_spreadsheet_paths(), get_training_sqlite_paths()
    ):
        excel_to_sqlite(spreadsheet, sqlite)