Validate myReduceByKey Function
Let us perform a few tasks to validate the myReduceByKey function.
In [1]:
%run 04_develop_myMap_function.ipynb
In [2]:
%run 08_develop_myReduceByKey_function.ipynb
- Use the function to get the count by date from orders.
In [3]:
# Path of the orders data file that feeds the count-by-date check below.
orders_path = "/data/retail_db/orders/part-00000"
In [4]:
# Load every order record as one string per line.
# A context manager is used so the file handle is closed as soon as the
# content has been read, instead of being left open for the kernel's lifetime.
with open(orders_path) as orders_file:
    orders = orders_file.read().splitlines()
In [5]:
# Preview the first 10 raw order records (comma-separated strings).
orders[:10]
Out[5]:
['1,2013-07-25 00:00:00.0,11599,CLOSED', '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT', '3,2013-07-25 00:00:00.0,12111,COMPLETE', '4,2013-07-25 00:00:00.0,8827,CLOSED', '5,2013-07-25 00:00:00.0,11318,COMPLETE', '6,2013-07-25 00:00:00.0,7130,COMPLETE', '7,2013-07-25 00:00:00.0,4530,COMPLETE', '8,2013-07-25 00:00:00.0,2911,PROCESSING', '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT', '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT']
In [6]:
# Build one (order_date, 1) pair per order; the date is the second
# comma-separated field of each record.
orders_map = myMap(
    orders,
    lambda rec: (rec.split(',')[1], 1),
)
# Preview the first 10 pairs.
orders_map[:10]
Out[6]:
[('2013-07-25 00:00:00.0', 1), ('2013-07-25 00:00:00.0', 1), ('2013-07-25 00:00:00.0', 1), ('2013-07-25 00:00:00.0', 1), ('2013-07-25 00:00:00.0', 1), ('2013-07-25 00:00:00.0', 1), ('2013-07-25 00:00:00.0', 1), ('2013-07-25 00:00:00.0', 1), ('2013-07-25 00:00:00.0', 1), ('2013-07-25 00:00:00.0', 1)]
In [7]:
# Reduce the (order_date, 1) pairs by key, adding the 1s together so each
# date ends up with its number of orders.
order_count_by_date = myReduceByKey(
    orders_map,
    lambda total, one: total + one,
)
In [8]:
# Preview the first 10 (order_date, count) results.
order_count_by_date[:10]
Out[8]:
[('2013-07-25 00:00:00.0', 143), ('2013-07-26 00:00:00.0', 269), ('2013-07-27 00:00:00.0', 202), ('2013-07-28 00:00:00.0', 187), ('2013-07-29 00:00:00.0', 253), ('2013-07-30 00:00:00.0', 227), ('2013-07-31 00:00:00.0', 252), ('2013-08-01 00:00:00.0', 246), ('2013-08-02 00:00:00.0', 224), ('2013-08-03 00:00:00.0', 183)]
In [9]:
# Number of order records that were read from the file.
len(orders)
Out[9]:
68883
In [10]:
# Number of (order_date, count) pairs returned by myReduceByKey.
len(order_count_by_date)
Out[10]:
364
- Use the function to get the revenue for each order id.
In [11]:
# Path of the order_items data file used for the per-order revenue checks.
order_items_path = "/data/retail_db/order_items/part-00000"

# Load every order item record as one string per line.
# A context manager is used so the file handle is closed as soon as the
# content has been read, instead of being left open for the kernel's lifetime.
with open(order_items_path) as order_items_file:
    order_items = order_items_file.read().splitlines()
In [12]:
# Preview the first 10 raw order item records (comma-separated strings).
order_items[:10]
Out[12]:
['1,1,957,1,299.98,299.98', '2,2,1073,1,199.99,199.99', '3,2,502,5,250.0,50.0', '4,2,403,1,129.99,129.99', '5,4,897,2,49.98,24.99', '6,4,365,5,299.95,59.99', '7,4,502,3,150.0,50.0', '8,4,1014,4,199.92,49.98', '9,5,957,1,299.98,299.98', '10,5,365,5,299.95,59.99']
In [13]:
def order_id_with_revenue(order_item):
    """Convert one order item record into an (order_id, revenue) pair.

    Args:
        order_item: A comma-separated order item record.

    Returns:
        A tuple of field 1 parsed as int and field 4 parsed as float.

    The record is split once and the resulting fields are reused, rather
    than splitting the same string separately for each field.
    """
    fields = order_item.split(',')
    return int(fields[1]), float(fields[4])


# One (order_id, revenue) pair per order item.
order_items_map = myMap(order_items, order_id_with_revenue)
In [14]:
# Preview the first 10 (order_id, revenue) pairs.
order_items_map[:10]
Out[14]:
[(1, 299.98), (2, 199.99), (2, 250.0), (2, 129.99), (4, 49.98), (4, 299.95), (4, 150.0), (4, 199.92), (5, 299.98), (5, 299.95)]
In [15]:
# Add up the revenue values for each order id, rounding the running total
# to 2 decimal places at every step.
revenue_per_order = myReduceByKey(
    order_items_map,
    lambda total, revenue: round(total + revenue, 2),
)
In [16]:
# Preview the first 10 (order_id, total revenue) results.
revenue_per_order[:10]
Out[16]:
[(1, 299.98), (2, 579.98), (4, 699.85), (5, 1129.86), (7, 579.92), (8, 729.84), (9, 599.96), (10, 651.92), (11, 919.79), (12, 1299.87)]
In [17]:
# Same reduction with a different aggregate: keep the smallest revenue value
# seen for each order id, and show the first 10 results.
myReduceByKey(
    order_items_map,
    lambda smallest, revenue: min(smallest, revenue),
)[:10]
Out[17]:
[(1, 299.98), (2, 129.99), (4, 49.98), (5, 99.96), (7, 79.95), (8, 50.0), (9, 199.98), (10, 21.99), (11, 49.98), (12, 100.0)]
- Use the function to get the revenue as well as the number of items for each order id.
In [18]:
# Look at the raw order item records again before building the next mapping.
order_items[:10]
Out[18]:
['1,1,957,1,299.98,299.98', '2,2,1073,1,199.99,199.99', '3,2,502,5,250.0,50.0', '4,2,403,1,129.99,129.99', '5,4,897,2,49.98,24.99', '6,4,365,5,299.95,59.99', '7,4,502,3,150.0,50.0', '8,4,1014,4,199.92,49.98', '9,5,957,1,299.98,299.98', '10,5,365,5,299.95,59.99']
In [19]:
def order_id_with_revenue_and_count(order_item):
    """Convert one order item record into (order_id, (revenue, 1)).

    Args:
        order_item: A comma-separated order item record.

    Returns:
        A tuple whose first element is field 1 parsed as int and whose second
        element is the pair (field 4 parsed as float, 1). The constant 1 lets
        a later reduction count items while summing revenue.

    The record is split once and the resulting fields are reused, rather
    than splitting the same string separately for each field.
    """
    fields = order_item.split(',')
    return int(fields[1]), (float(fields[4]), 1)


# NOTE: this rebinds order_items_map; values are now (revenue, 1) tuples
# instead of plain floats, so reducers must combine tuples.
order_items_map = myMap(order_items, order_id_with_revenue_and_count)
In [20]:
# Preview the first 10 (order_id, (revenue, 1)) pairs.
order_items_map[:10]
Out[20]:
[(1, (299.98, 1)), (2, (199.99, 1)), (2, (250.0, 1)), (2, (129.99, 1)), (4, (49.98, 1)), (4, (299.95, 1)), (4, (150.0, 1)), (4, (199.92, 1)), (5, (299.98, 1)), (5, (299.95, 1))]
In [21]:
# Illustration only: order id 2 together with the (revenue, 1) values that
# belong to it in order_items_map.
[2, [(199.99, 1), (250.0, 1), (129.99, 1)]]
Out[21]:
[2, [(199.99, 1), (250.0, 1), (129.99, 1)]]
In [22]:
# Worked example of combining two (revenue, count) tuples element-wise:
# the revenues are added together and so are the counts.
t1, t2 = (199.99, 1), (250.0, 1)
(t1[0] + t2[0], t1[1] + t2[1])
Out[22]:
(449.99, 2)
In [23]:
# Combine the (revenue, count) tuples for each order id: revenue is summed
# and rounded to 2 decimal places, counts are summed. Show the first 10.
myReduceByKey(
    order_items_map,
    lambda acc, item: (round(acc[0] + item[0], 2), acc[1] + item[1]),
)[:10]
Out[23]:
[(1, (299.98, 1)), (2, (579.98, 3)), (4, (699.85, 4)), (5, (1129.86, 5)), (7, (579.92, 3)), (8, (729.84, 4)), (9, (599.96, 3)), (10, (651.92, 5)), (11, (919.79, 5)), (12, (1299.87, 5))]