tilde: ~ alt gr and +
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
df = pd.read_csv("/content/drive/MyDrive/Sales_Data/Sales_April_2019.csv")
df.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
---|---|---|---|---|---|---|
0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 |
1 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 |
3 | 176560 | Google Phone | 1 | 600 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 |
4 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 |
df = pd.read_csv("/content/drive/MyDrive/Sales_Data/Sales_April_2019.csv")
files = [file for file in os.listdir('/content/drive/MyDrive/Sales_Data/')]
all_months_data = pd.DataFrame()
for file in files:
df = pd.read_csv("/content/drive/MyDrive/Sales_Data/"+file)
all_months_data = pd.concat([all_months_data, df])
all_months_data.to_csv('all_months.csv', index=False)
all_months = pd.read_csv('all_months.csv')
all_months.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
---|---|---|---|---|---|---|
0 | 248151 | AA Batteries (4-pack) | 4 | 3.84 | 09/17/19 14:44 | 380 North St, Los Angeles, CA 90001 |
1 | 248152 | USB-C Charging Cable | 2 | 11.95 | 09/29/19 10:19 | 511 8th St, Austin, TX 73301 |
2 | 248153 | USB-C Charging Cable | 1 | 11.95 | 09/16/19 17:48 | 151 Johnson St, Los Angeles, CA 90001 |
3 | 248154 | 27in FHD Monitor | 1 | 149.99 | 09/27/19 07:52 | 355 Hickory St, Seattle, WA 98101 |
4 | 248155 | USB-C Charging Cable | 1 | 11.95 | 09/01/19 19:03 | 125 5th St, Atlanta, GA 30301 |
Drop rows of NaN
nan_df = all_months[all_months.isna().any(axis=1)]
nan_df.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
---|---|---|---|---|---|---|
198 | NaN | NaN | NaN | NaN | NaN | NaN |
574 | NaN | NaN | NaN | NaN | NaN | NaN |
776 | NaN | NaN | NaN | NaN | NaN | NaN |
1383 | NaN | NaN | NaN | NaN | NaN | NaN |
1436 | NaN | NaN | NaN | NaN | NaN | NaN |
all_months = all_months.dropna(how='all')
all_months.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
---|---|---|---|---|---|---|
0 | 248151 | AA Batteries (4-pack) | 4 | 3.84 | 09/17/19 14:44 | 380 North St, Los Angeles, CA 90001 |
1 | 248152 | USB-C Charging Cable | 2 | 11.95 | 09/29/19 10:19 | 511 8th St, Austin, TX 73301 |
2 | 248153 | USB-C Charging Cable | 1 | 11.95 | 09/16/19 17:48 | 151 Johnson St, Los Angeles, CA 90001 |
3 | 248154 | 27in FHD Monitor | 1 | 149.99 | 09/27/19 07:52 | 355 Hickory St, Seattle, WA 98101 |
4 | 248155 | USB-C Charging Cable | 1 | 11.95 | 09/01/19 19:03 | 125 5th St, Atlanta, GA 30301 |
temp_df = all_months[all_months['Order Date'].str[0:2] == 'Or']
temp_df.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
---|---|---|---|---|---|---|
1679 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
3732 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
3790 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
4323 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
5579 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address |
all_months = all_months[all_months['Order Date'].str[0:2] != 'Or']
all_months.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 185950 entries, 0 to 186849 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 185950 non-null object 1 Product 185950 non-null object 2 Quantity Ordered 185950 non-null object 3 Price Each 185950 non-null object 4 Order Date 185950 non-null object 5 Purchase Address 185950 non-null object dtypes: object(6) memory usage: 9.9+ MB
all_months['Quantity Ordered'] = pd.to_numeric(all_months['Quantity Ordered']) # Make int
all_months['Price Each'] = pd.to_numeric(all_months['Price Each']) # Make float
all_months.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
---|---|---|---|---|---|---|
0 | 248151 | AA Batteries (4-pack) | 4 | 3.84 | 09/17/19 14:44 | 380 North St, Los Angeles, CA 90001 |
1 | 248152 | USB-C Charging Cable | 2 | 11.95 | 09/29/19 10:19 | 511 8th St, Austin, TX 73301 |
2 | 248153 | USB-C Charging Cable | 1 | 11.95 | 09/16/19 17:48 | 151 Johnson St, Los Angeles, CA 90001 |
3 | 248154 | 27in FHD Monitor | 1 | 149.99 | 09/27/19 07:52 | 355 Hickory St, Seattle, WA 98101 |
4 | 248155 | USB-C Charging Cable | 1 | 11.95 | 09/01/19 19:03 | 125 5th St, Atlanta, GA 30301 |
all_months['Order Date'][0]
'09/17/19 14:44'
all_months['Month'] = all_months['Order Date'].str[0:2]
all_months['Month'] = all_months['Month'].astype('int32')
all_months.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | |
---|---|---|---|---|---|---|---|
0 | 248151 | AA Batteries (4-pack) | 4 | 3.84 | 09/17/19 14:44 | 380 North St, Los Angeles, CA 90001 | 9 |
1 | 248152 | USB-C Charging Cable | 2 | 11.95 | 09/29/19 10:19 | 511 8th St, Austin, TX 73301 | 9 |
2 | 248153 | USB-C Charging Cable | 1 | 11.95 | 09/16/19 17:48 | 151 Johnson St, Los Angeles, CA 90001 | 9 |
3 | 248154 | 27in FHD Monitor | 1 | 149.99 | 09/27/19 07:52 | 355 Hickory St, Seattle, WA 98101 | 9 |
4 | 248155 | USB-C Charging Cable | 1 | 11.95 | 09/01/19 19:03 | 125 5th St, Atlanta, GA 30301 | 9 |
all_months['Sales'] = all_months['Quantity Ordered'] * all_months['Price Each']
all_months.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | |
---|---|---|---|---|---|---|---|---|
0 | 248151 | AA Batteries (4-pack) | 4 | 3.84 | 09/17/19 14:44 | 380 North St, Los Angeles, CA 90001 | 9 | 15.36 |
1 | 248152 | USB-C Charging Cable | 2 | 11.95 | 09/29/19 10:19 | 511 8th St, Austin, TX 73301 | 9 | 23.90 |
2 | 248153 | USB-C Charging Cable | 1 | 11.95 | 09/16/19 17:48 | 151 Johnson St, Los Angeles, CA 90001 | 9 | 11.95 |
3 | 248154 | 27in FHD Monitor | 1 | 149.99 | 09/27/19 07:52 | 355 Hickory St, Seattle, WA 98101 | 9 | 149.99 |
4 | 248155 | USB-C Charging Cable | 1 | 11.95 | 09/01/19 19:03 | 125 5th St, Atlanta, GA 30301 | 9 | 11.95 |
# Let's use .apply()
all_months['City'] = all_months['Purchase Address'].apply(lambda x: x.split(',')[1])
all_months.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | |
---|---|---|---|---|---|---|---|---|---|
0 | 248151 | AA Batteries (4-pack) | 4 | 3.84 | 09/17/19 14:44 | 380 North St, Los Angeles, CA 90001 | 9 | 15.36 | Los Angeles |
1 | 248152 | USB-C Charging Cable | 2 | 11.95 | 09/29/19 10:19 | 511 8th St, Austin, TX 73301 | 9 | 23.90 | Austin |
2 | 248153 | USB-C Charging Cable | 1 | 11.95 | 09/16/19 17:48 | 151 Johnson St, Los Angeles, CA 90001 | 9 | 11.95 | Los Angeles |
3 | 248154 | 27in FHD Monitor | 1 | 149.99 | 09/27/19 07:52 | 355 Hickory St, Seattle, WA 98101 | 9 | 149.99 | Seattle |
4 | 248155 | USB-C Charging Cable | 1 | 11.95 | 09/01/19 19:03 | 125 5th St, Atlanta, GA 30301 | 9 | 11.95 | Atlanta |
C'est une fonction puissante
# Let's use .apply()
def get_city(address):
return address.split(',')[1]
all_months['City'] = all_months['Purchase Address'].apply(lambda x: get_city(x))
### C'est une fonction puissante qu'on peut utiliser comme on veut.
results = all_months.groupby('Month').sum()
results
Quantity Ordered | Price Each | Sales | |
---|---|---|---|
Month | |||
1 | 10903 | 1811768.38 | 1822256.73 |
2 | 13449 | 2188884.72 | 2202022.42 |
3 | 17005 | 2791207.83 | 2807100.38 |
4 | 20558 | 3367671.02 | 3390670.24 |
5 | 18667 | 3135125.13 | 3152606.75 |
6 | 15253 | 2562025.61 | 2577802.26 |
7 | 16072 | 2632539.56 | 2647775.76 |
8 | 13448 | 2230345.42 | 2244467.88 |
9 | 13109 | 2084992.09 | 2097560.13 |
10 | 22703 | 3715554.83 | 3736726.88 |
11 | 19798 | 3180600.68 | 3199603.20 |
12 | 28114 | 4588415.41 | 4613443.34 |
results.plot(kind='bar', figsize=(15,6));
We can see that Dezember was the best sale month with over 4.613443 dollars and the worst month was January with 1.822257 dollars
import matplotlib.pyplot as plt
months = range(1,13)
plt.rcParams["figure.figsize"] = (15,6)
plt.bar(months, results['Sales'])
plt.xticks(months)
plt.ylabel('Sales in USD ($)')
plt.xlabel('Month number')
#on peut également écrire plt.show()
Text(0.5, 0, 'Month number')
result2 = all_months.groupby('City').sum()
result2
Quantity Ordered | Price Each | Month | Sales | |
---|---|---|---|---|
City | ||||
Atlanta | 16602 | 2779908.20 | 104794 | 2795498.58 |
Austin | 11153 | 1809873.61 | 69829 | 1819581.75 |
Boston | 22528 | 3637409.77 | 141112 | 3661642.01 |
Dallas | 16730 | 2752627.82 | 104620 | 2767975.40 |
Los Angeles | 33289 | 5421435.23 | 208325 | 5452570.80 |
New York City | 27932 | 4635370.83 | 175741 | 4664317.43 |
Portland | 14053 | 2307747.47 | 87765 | 2320490.61 |
San Francisco | 50239 | 8211461.74 | 315520 | 8262203.91 |
Seattle | 16553 | 2733296.01 | 104941 | 2747755.48 |
result2.plot(kind='bar', figsize=(15,6));
import matplotlib.pyplot as plt
cities = [city for city, df in all_months.groupby('City')]
plt.bar(cities, result2['Sales'])
plt.xticks(cities, rotation='vertical', size=8)
plt.ylabel('Sales in USD ($)')
plt.xlabel('City name')
#on peut également écrire plt.show()
Text(0.5, 0, 'City name')
all_months.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | |
---|---|---|---|---|---|---|---|---|---|
0 | 248151 | AA Batteries (4-pack) | 4 | 3.84 | 09/17/19 14:44 | 380 North St, Los Angeles, CA 90001 | 9 | 15.36 | Los Angeles |
1 | 248152 | USB-C Charging Cable | 2 | 11.95 | 09/29/19 10:19 | 511 8th St, Austin, TX 73301 | 9 | 23.90 | Austin |
2 | 248153 | USB-C Charging Cable | 1 | 11.95 | 09/16/19 17:48 | 151 Johnson St, Los Angeles, CA 90001 | 9 | 11.95 | Los Angeles |
3 | 248154 | 27in FHD Monitor | 1 | 149.99 | 09/27/19 07:52 | 355 Hickory St, Seattle, WA 98101 | 9 | 149.99 | Seattle |
4 | 248155 | USB-C Charging Cable | 1 | 11.95 | 09/01/19 19:03 | 125 5th St, Atlanta, GA 30301 | 9 | 11.95 | Atlanta |
all_months['Order Date'] = pd.to_datetime(all_months['Order Date'])
all_months.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | |
---|---|---|---|---|---|---|---|---|---|
0 | 248151 | AA Batteries (4-pack) | 4 | 3.84 | 2019-09-17 14:44:00 | 380 North St, Los Angeles, CA 90001 | 9 | 15.36 | Los Angeles |
1 | 248152 | USB-C Charging Cable | 2 | 11.95 | 2019-09-29 10:19:00 | 511 8th St, Austin, TX 73301 | 9 | 23.90 | Austin |
2 | 248153 | USB-C Charging Cable | 1 | 11.95 | 2019-09-16 17:48:00 | 151 Johnson St, Los Angeles, CA 90001 | 9 | 11.95 | Los Angeles |
3 | 248154 | 27in FHD Monitor | 1 | 149.99 | 2019-09-27 07:52:00 | 355 Hickory St, Seattle, WA 98101 | 9 | 149.99 | Seattle |
4 | 248155 | USB-C Charging Cable | 1 | 11.95 | 2019-09-01 19:03:00 | 125 5th St, Atlanta, GA 30301 | 9 | 11.95 | Atlanta |
all_months['Hour'] = all_months['Order Date'].dt.hour
all_months['Minute'] = all_months['Order Date'].dt.minute
all_months.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | Hour | Minute | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 248151 | AA Batteries (4-pack) | 4 | 3.84 | 2019-09-17 14:44:00 | 380 North St, Los Angeles, CA 90001 | 9 | 15.36 | Los Angeles | 14 | 44 |
1 | 248152 | USB-C Charging Cable | 2 | 11.95 | 2019-09-29 10:19:00 | 511 8th St, Austin, TX 73301 | 9 | 23.90 | Austin | 10 | 19 |
2 | 248153 | USB-C Charging Cable | 1 | 11.95 | 2019-09-16 17:48:00 | 151 Johnson St, Los Angeles, CA 90001 | 9 | 11.95 | Los Angeles | 17 | 48 |
3 | 248154 | 27in FHD Monitor | 1 | 149.99 | 2019-09-27 07:52:00 | 355 Hickory St, Seattle, WA 98101 | 9 | 149.99 | Seattle | 7 | 52 |
4 | 248155 | USB-C Charging Cable | 1 | 11.95 | 2019-09-01 19:03:00 | 125 5th St, Atlanta, GA 30301 | 9 | 11.95 | Atlanta | 19 | 3 |
hours = [hours for hours, df in all_months.groupby('Hour')]
plt.plot(hours, all_months.groupby(['Hour']).count())
plt.rcParams["figure.figsize"] = (15,6)
plt.xticks(hours)
plt.xlabel('Hour')
plt.ylabel('Number of Orders')
plt.grid()
What time should we display adverisements to maximize likelihood of customer's buying product? my recommendation is around 11 am (11h) or 7 pm (19h)
df = all_months[all_months['Order ID'].duplicated(keep=False)]
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
df.head()
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | Hour | Minute | Grouped | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
20 | 248171 | USB-C Charging Cable | 1 | 11.95 | 2019-09-05 15:06:00 | 705 Hill St, New York City, NY 10001 | 9 | 11.95 | New York City | 15 | 6 | USB-C Charging Cable,USB-C Charging Cable |
21 | 248171 | USB-C Charging Cable | 1 | 11.95 | 2019-09-05 15:06:00 | 705 Hill St, New York City, NY 10001 | 9 | 11.95 | New York City | 15 | 6 | USB-C Charging Cable,USB-C Charging Cable |
60 | 248210 | iPhone | 1 | 700.00 | 2019-09-27 10:33:00 | 292 Washington St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco | 10 | 33 | iPhone,Lightning Charging Cable |
61 | 248210 | Lightning Charging Cable | 1 | 14.95 | 2019-09-27 10:33:00 | 292 Washington St, San Francisco, CA 94016 | 9 | 14.95 | San Francisco | 10 | 33 | iPhone,Lightning Charging Cable |
70 | 248219 | Bose SoundSport Headphones | 1 | 99.99 | 2019-09-24 12:48:00 | 665 Maple St, Los Angeles, CA 90001 | 9 | 99.99 | Los Angeles | 12 | 48 | Bose SoundSport Headphones,Google Phone |
df = all_months[all_months['Order ID'].duplicated(keep=False)]
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
df = df[['Order ID', 'Grouped']].drop_duplicates()
df.head()
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Order ID | Grouped | |
---|---|---|
20 | 248171 | USB-C Charging Cable,USB-C Charging Cable |
60 | 248210 | iPhone,Lightning Charging Cable |
70 | 248219 | Bose SoundSport Headphones,Google Phone |
117 | 248265 | Vareebadd Phone,Bose SoundSport Headphones |
176 | 248323 | Google Phone,Bose SoundSport Headphones |
# https://stackoverflow.com/questions/52195887/counting-unique-pairs-of-numbers-into-a-python-dictionary
from itertools import combinations
from collections import Counter
count = Counter()
for row in df['Grouped']:
row_list = row.split(',')
count.update(Counter(combinations(row_list, 2)))
for key, value in count.most_common(10):
print(key, value)
('iPhone', 'Lightning Charging Cable') 1005 ('Google Phone', 'USB-C Charging Cable') 987 ('iPhone', 'Wired Headphones') 447 ('Google Phone', 'Wired Headphones') 414 ('Vareebadd Phone', 'USB-C Charging Cable') 361 ('iPhone', 'Apple Airpods Headphones') 360 ('Google Phone', 'Bose SoundSport Headphones') 220 ('USB-C Charging Cable', 'Wired Headphones') 160 ('Vareebadd Phone', 'Wired Headphones') 143 ('Lightning Charging Cable', 'Wired Headphones') 92
all_months.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | Hour | Minute | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 248151 | AA Batteries (4-pack) | 4 | 3.84 | 2019-09-17 14:44:00 | 380 North St, Los Angeles, CA 90001 | 9 | 15.36 | Los Angeles | 14 | 44 |
1 | 248152 | USB-C Charging Cable | 2 | 11.95 | 2019-09-29 10:19:00 | 511 8th St, Austin, TX 73301 | 9 | 23.90 | Austin | 10 | 19 |
2 | 248153 | USB-C Charging Cable | 1 | 11.95 | 2019-09-16 17:48:00 | 151 Johnson St, Los Angeles, CA 90001 | 9 | 11.95 | Los Angeles | 17 | 48 |
3 | 248154 | 27in FHD Monitor | 1 | 149.99 | 2019-09-27 07:52:00 | 355 Hickory St, Seattle, WA 98101 | 9 | 149.99 | Seattle | 7 | 52 |
4 | 248155 | USB-C Charging Cable | 1 | 11.95 | 2019-09-01 19:03:00 | 125 5th St, Atlanta, GA 30301 | 9 | 11.95 | Atlanta | 19 | 3 |
product_group = all_months.groupby('Product')
quantity_ordered = product_group.sum()['Quantity Ordered']
products = [product for product, df in product_group]
plt.bar(products, quantity_ordered)
plt.ylabel('Quantity Ordered')
plt.xlabel('Product')
plt.xticks(products, rotation='vertical', size=8)
plt.show()
prices = all_months.groupby('Product').mean()['Price Each']
print(prices)
Product 20in Monitor 109.99 27in 4K Gaming Monitor 389.99 27in FHD Monitor 149.99 34in Ultrawide Monitor 379.99 AA Batteries (4-pack) 3.84 AAA Batteries (4-pack) 2.99 Apple Airpods Headphones 150.00 Bose SoundSport Headphones 99.99 Flatscreen TV 300.00 Google Phone 600.00 LG Dryer 600.00 LG Washing Machine 600.00 Lightning Charging Cable 14.95 Macbook Pro Laptop 1700.00 ThinkPad Laptop 999.99 USB-C Charging Cable 11.95 Vareebadd Phone 400.00 Wired Headphones 11.99 iPhone 700.00 Name: Price Each, dtype: float64
prices = all_months.groupby('Product').mean()['Price Each']
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.bar(products, quantity_ordered, color='g')
ax2.plot(products, prices, 'b-')
ax1.set_xlabel('Product Name')
ax1.set_ylabel('Quantity Ordered', color='g')
ax2.set_ylabel('Price ($)', color='b')
ax1.set_xticklabels(products, rotation='vertical', size=8)
plt.show()