[
Reading Data from File¶
Let us read both orders as well as order_items data set from files into Pandas Data Frame.
- Both the files does not have header and hence we need to pass the schema while creating data frames.
In [1]:
# !ls -ltr /data/retail_db/orders/part-00000
In [2]:
# !head /data/retail_db/orders/part-00000
In [3]:
import pandas as pd
def get_df(path, schema):
df = pd.read_csv(
path,
header=None,
names=schema
)
return df
In [4]:
orders_path = "/data/retail_db/orders/part-00000"
orders_schema = [
"order_id",
"order_date",
"order_customer_id",
"order_status"
]
orders = get_df(orders_path, orders_schema)
In [5]:
# type(orders)
In [6]:
# orders.head(3)
{note}
When it comes to loading data into database using `cursor.executemany`, we need to pass data as list of tuples or list of lists (not as Pandas Dataframe). We can use `orders.values.tolist()` to convert records in the Pandas Dataframe to list of lists.
In [7]:
# orders.values.tolist()[:3]
In [8]:
# type(orders.values.tolist())
In [9]:
# type(orders.values.tolist()[2])
In [10]:
order_items_path = "/data/retail_db/order_items/part-00000"
order_items_schema = [
"order_item_id",
"order_item_order_id",
"order_item_product_id",
"order_item_quantity",
"order_item_subtotal",
"order_item_product_price"
]
order_items = get_df(order_items_path, order_items_schema)
In [11]:
# order_items.head(3)
]