# import os
# # os.environ['PATH'].split(';')
# Importing packages, set data source paths
import pandas as pd
import numpy as np
breast_cancer = 'https://raw.githubusercontent.com/datamites/pandas/master/breast_cancer.csv'
adult = 'https://raw.githubusercontent.com/datamites/pandas/master/adult10.data'
book_price = 'https://raw.githubusercontent.com/datamites/pandas/master/book_price_data.xlsx'
book_reviews = 'https://raw.githubusercontent.com/datamites/pandas/master/book_reviews_price.xlsx'
hp_costs = 'https://raw.githubusercontent.com/datamites/pandas/master/hp_other_costs.csv'
house_price_bangalore = 'https://raw.githubusercontent.com/datamites/pandas/master/house_price_bangalore.csv'
mtcars = 'https://raw.githubusercontent.com/datamites/pandas/master/mtcars.csv'
mtcars_missing = 'https://raw.githubusercontent.com/datamites/pandas/master/mtcars_missing.csv'
wine= 'https://raw.githubusercontent.com/datamites/pandas/master/winequality.csv'
sales_returns = 'https://raw.githubusercontent.com/datamites/pandas/master/sales_returns.csv'
superstore_orders = 'https://raw.githubusercontent.com/datamites/pandas/master/superstore_orders.csv'
car_sales = 'https://raw.githubusercontent.com/datamites/pandas/master/car-sales.csv'
import pandas as pd
import numpy as np
hp_costs = 'https://raw.githubusercontent.com/datamites/pandas/master/hp_other_costs.csv'
# Load the house-price dataset that carries an extra free-text `other_costs` column.
data = pd.read_csv(hp_costs)
# Sanity check: (rows, columns) of the loaded frame.
print(data.shape)
# Preview the first five rows.
data.head()
(13320, 10)
area_type | availability | location | size | society | total_sqft | bath | balcony | other_costs | price | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Super built-up Area | 19-Dec | Electronic City Phase II | 2 BHK | Coomee | 1056 | 2.0 | 1.0 | custom 2.0L reg 1.76L misc 3.7L | 39.07 |
1 | Plot Area | Ready To Move | Chikka Tirupathi | 4 Bedroom | Theanmp | 2600 | 5.0 | 3.0 | custom 2.4L reg 5.4L | 120.00 |
2 | Built-up Area | Ready To Move | Uttarahalli | 3 BHK | NaN | 1440 | 2.0 | 3.0 | reg 2.79L misc 3.5L | 62.00 |
3 | Super built-up Area | Ready To Move | Lingadheeranahalli | 3 BHK | Soiewre | 1521 | 3.0 | 1.0 | custom 2.5L reg 4.28L pd 1.34L misc 1.6L | 95.00 |
4 | Super built-up Area | Ready To Move | Kothanur | 2 BHK | NaN | 1200 | 2.0 | 1.0 | reg 2.3L | 51.00 |
data.columns
Index(['area_type', 'availability', 'location', 'size', 'society', 'total_sqft', 'bath', 'balcony', 'other_costs', 'price'], dtype='object')
data.other_costs[0]
'custom 2.0L reg 1.76L misc 3.7L'
# Pull every numeric amount out of the free-text cost string, e.g.
# 'custom 2.0L reg 1.76L misc 3.7L' -> ['2.0', '1.76', '3.7'].
# Raw string + escaped dot: the decimal point must be a literal '.', and the
# fractional part is optional so whole-number amounts are captured as well.
data.other_costs.str.findall(r'\d+(?:\.\d+)?')
0 [2.0, 1.76, 3.7] 1 [2.4, 5.4] 2 [2.79, 3.5] 3 [2.5, 4.28, 1.34, 1.6] 4 [2.3] ... 13315 [1.8, 10.4, 3.2] 13316 [4.1, 3.9] 13317 [5.6, 2.7, 2.4] 13318 [3.9, 21.96, 1.4] 13319 [3.0, 0.77, 1.9] Name: other_costs, Length: 13320, dtype: object
# Store the extracted amounts (still strings) as a list per row.
# Raw string + escaped dot: the decimal point must be a literal '.', and the
# fractional part is optional so whole-number amounts are captured as well.
data['other_costs_list'] = data.other_costs.str.findall(r'\d+(?:\.\d+)?')
data.head()
area_type | availability | location | size | society | total_sqft | bath | balcony | other_costs | price | other_costs_list | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Super built-up Area | 19-Dec | Electronic City Phase II | 2 BHK | Coomee | 1056 | 2.0 | 1.0 | custom 2.0L reg 1.76L misc 3.7L | 39.07 | [2.0, 1.76, 3.7] |
1 | Plot Area | Ready To Move | Chikka Tirupathi | 4 Bedroom | Theanmp | 2600 | 5.0 | 3.0 | custom 2.4L reg 5.4L | 120.00 | [2.4, 5.4] |
2 | Built-up Area | Ready To Move | Uttarahalli | 3 BHK | NaN | 1440 | 2.0 | 3.0 | reg 2.79L misc 3.5L | 62.00 | [2.79, 3.5] |
3 | Super built-up Area | Ready To Move | Lingadheeranahalli | 3 BHK | Soiewre | 1521 | 3.0 | 1.0 | custom 2.5L reg 4.28L pd 1.34L misc 1.6L | 95.00 | [2.5, 4.28, 1.34, 1.6] |
4 | Super built-up Area | Ready To Move | Kothanur | 2 BHK | NaN | 1200 | 2.0 | 1.0 | reg 2.3L | 51.00 | [2.3] |
# total_cost = listed price + the sum of all itemised extra costs.
# The amounts in other_costs_list are strings, so convert each to float first.
# Computed column-wise instead of writing one cell at a time via iterrows/.loc.
extra_costs = data.other_costs_list.apply(
    lambda amounts: sum(float(amount) for amount in amounts)
)
data['total_cost'] = data.price + extra_costs
data.head()
area_type | availability | location | size | society | total_sqft | bath | balcony | other_costs | price | other_costs_list | total_cost | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Super built-up Area | 19-Dec | Electronic City Phase II | 2 BHK | Coomee | 1056 | 2.0 | 1.0 | custom 2.0L reg 1.76L misc 3.7L | 39.07 | [2.0, 1.76, 3.7] | 46.53 |
1 | Plot Area | Ready To Move | Chikka Tirupathi | 4 Bedroom | Theanmp | 2600 | 5.0 | 3.0 | custom 2.4L reg 5.4L | 120.00 | [2.4, 5.4] | 127.80 |
2 | Built-up Area | Ready To Move | Uttarahalli | 3 BHK | NaN | 1440 | 2.0 | 3.0 | reg 2.79L misc 3.5L | 62.00 | [2.79, 3.5] | 68.29 |
3 | Super built-up Area | Ready To Move | Lingadheeranahalli | 3 BHK | Soiewre | 1521 | 3.0 | 1.0 | custom 2.5L reg 4.28L pd 1.34L misc 1.6L | 95.00 | [2.5, 4.28, 1.34, 1.6] | 104.72 |
4 | Super built-up Area | Ready To Move | Kothanur | 2 BHK | NaN | 1200 | 2.0 | 1.0 | reg 2.3L | 51.00 | [2.3] | 53.30 |
superstore_orders = 'https://raw.githubusercontent.com/datamites/pandas/master/superstore_orders.csv'
# Load the superstore orders dataset.
data2 = pd.read_csv(superstore_orders)
# Sanity check: (rows, columns) of the loaded frame.
print(data2.shape)
# Preview the first five rows.
data2.head(5)
(9994, 21)
row_id | order_id | order_date | ship_date | ship_mode | customer_id | customer_name | segment | country | city | ... | postal_code | region | product_id | category | sub_category | product_name | sales | quantity | discount | profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420.0 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
1 | 2 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420.0 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
2 | 3 | CA-2017-138688 | 2017-06-12 | 2017-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | ... | 90036.0 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
3 | 4 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311.0 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
4 | 5 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311.0 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
5 rows × 21 columns
data2.columns
Index(['row_id', 'order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_id', 'customer_name', 'segment', 'country', 'city', 'state', 'postal_code', 'region', 'product_id', 'category', 'sub_category', 'product_name', 'sales', 'quantity', 'discount', 'profit'], dtype='object')
# Total sales per category (rows) and region (columns).
data2.pivot_table(columns='region', index='category', values='sales', aggfunc='sum')
region | Central | East | South | West |
---|---|---|---|---|
category | ||||
Furniture | 163797.1638 | 208291.204 | 117298.684 | 252612.7435 |
Office Supplies | 167026.4150 | 205516.055 | 125651.313 | 220853.2490 |
Technology | 170416.3120 | 264973.981 | 148771.908 | 251991.8320 |
# Same pivot with two measures: summed sales and summed profit per category and region.
data2.pivot_table(columns='region', index='category', values=['sales','profit'], aggfunc='sum')
profit | sales | |||||||
---|---|---|---|---|---|---|---|---|
region | Central | East | South | West | Central | East | South | West |
category | ||||||||
Furniture | -2871.0494 | 3046.1658 | 6771.2061 | 11504.9503 | 163797.1638 | 208291.204 | 117298.684 | 252612.7435 |
Office Supplies | 8879.9799 | 41014.5791 | 19986.3928 | 52609.8490 | 167026.4150 | 205516.055 | 125651.313 | 220853.2490 |
Technology | 33697.4320 | 47462.0351 | 19991.8314 | 44303.6496 | 170416.3120 | 264973.981 | 148771.908 | 251991.8320 |
# Drill down to (category, sub_category) rows and colour the cells with the
# 'coolwarm' gradient.
data2.pivot_table(columns='region', index=['category','sub_category'], values=['sales','profit'], aggfunc='sum').style.background_gradient(cmap='coolwarm')
profit | sales | ||||||||
---|---|---|---|---|---|---|---|---|---|
region | Central | East | South | West | Central | East | South | West | |
category | sub_category | ||||||||
Furniture | Bookcases | -1997.904300 | -1167.631800 | 1339.491800 | -1646.511700 | 24157.176800 | 43819.334000 | 10899.362000 | 36004.123500 |
Chairs | 6592.722100 | 9357.770600 | 6612.089300 | 4027.584300 | 85230.646000 | 96260.683000 | 45176.446000 | 101781.328000 | |
Furnishings | -3906.216800 | 5881.407100 | 3442.682900 | 7641.270400 | 15254.370000 | 29071.380000 | 17306.684000 | 30072.730000 | |
Tables | -3559.650400 | -11025.380100 | -4623.057900 | 1482.607300 | 39154.971000 | 39139.807000 | 43916.192000 | 84754.562000 | |
Office Supplies | Appliances | -2638.617500 | 8391.413400 | 4123.939600 | 8261.269900 | 23582.033000 | 34188.466000 | 19525.326000 | 30236.336000 |
Art | 1195.159100 | 1899.944300 | 1058.586600 | 2374.097000 | 5765.340000 | 7485.764000 | 4655.622000 | 9212.066000 | |
Binders | -1043.636900 | 11267.934600 | 3900.664000 | 16096.801600 | 56923.282000 | 53497.997000 | 37030.341000 | 55961.113000 | |
Envelopes | 1777.528300 | 1812.409000 | 1465.477000 | 1908.762400 | 4636.872000 | 4375.874000 | 3345.556000 | 4118.100000 | |
Fasteners | 236.618600 | 263.990800 | 173.718100 | 275.190700 | 778.030000 | 819.718000 | 503.316000 | 923.216000 | |
Labels | 1073.079400 | 1129.280000 | 1040.772300 | 2303.122300 | 2451.472000 | 2602.934000 | 2353.180000 | 5078.726000 | |
Paper | 6971.900500 | 9015.371000 | 5947.061400 | 12119.236400 | 17491.902000 | 20172.602000 | 14150.984000 | 26663.718000 | |
Storage | 1969.836500 | 8389.371200 | 2274.296500 | 8645.322200 | 45930.112000 | 71612.584000 | 35768.060000 | 70532.852000 | |
Supplies | -661.888100 | -1155.135200 | 1.877300 | 626.046500 | 9467.372000 | 10760.116000 | 8318.928000 | 18127.122000 | |
Technology | Accessories | 7251.630600 | 11195.864400 | 7004.542400 | 16484.598300 | 33956.076000 | 45033.372000 | 27276.754000 | 61114.116000 |
Copiers | 15608.841300 | 17022.841800 | 3658.906700 | 19327.235100 | 37259.570000 | 53219.462000 | 9299.756000 | 49749.242000 | |
Machines | -1486.066600 | 6928.642900 | -1438.893000 | -618.926400 | 26797.384000 | 66106.165000 | 53890.960000 | 42444.122000 | |
Phones | 12323.026700 | 12314.686000 | 10767.275300 | 9110.742600 | 72403.282000 | 100614.982000 | 58304.438000 | 98684.352000 |
# Load the wine-quality dataset; this file is semicolon-separated, not comma-separated.
data3 = pd.read_csv(wine, delimiter=';')
# Sanity check: (rows, columns) of the loaded frame.
print(data3.shape)
# Preview the first five rows.
data3.head()
(4898, 12)
fixed-acidity | volatile-acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 |
1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 |
2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
data3.quality.value_counts()
6 2198 5 1457 7 880 8 175 4 163 3 20 9 5 Name: quality, dtype: int64
# Create a new column quality_category by binning the numeric quality score.
# pd.cut uses right-inclusive bins by default, so:
#   (0, 5] -> 'bad', (5, 7] -> 'good', (7, 10] -> 'premium'
data3['quality_category'] = pd.cut(data3.quality, bins=[0,5,7,10], labels = ['bad', 'good', 'premium'])
data3.head(20)
fixed-acidity | volatile-acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | quality_category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.0 | 0.27 | 0.36 | 20.70 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 | good |
1 | 6.3 | 0.30 | 0.34 | 1.60 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 | good |
2 | 8.1 | 0.28 | 0.40 | 6.90 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 | good |
3 | 7.2 | 0.23 | 0.32 | 8.50 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 | good |
4 | 7.2 | 0.23 | 0.32 | 8.50 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 | good |
5 | 8.1 | 0.28 | 0.40 | 6.90 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 | good |
6 | 6.2 | 0.32 | 0.16 | 7.00 | 0.045 | 30.0 | 136.0 | 0.9949 | 3.18 | 0.47 | 9.6 | 6 | good |
7 | 7.0 | 0.27 | 0.36 | 20.70 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 | good |
8 | 6.3 | 0.30 | 0.34 | 1.60 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 | good |
9 | 8.1 | 0.22 | 0.43 | 1.50 | 0.044 | 28.0 | 129.0 | 0.9938 | 3.22 | 0.45 | 11.0 | 6 | good |
10 | 8.1 | 0.27 | 0.41 | 1.45 | 0.033 | 11.0 | 63.0 | 0.9908 | 2.99 | 0.56 | 12.0 | 5 | bad |
11 | 8.6 | 0.23 | 0.40 | 4.20 | 0.035 | 17.0 | 109.0 | 0.9947 | 3.14 | 0.53 | 9.7 | 5 | bad |
12 | 7.9 | 0.18 | 0.37 | 1.20 | 0.040 | 16.0 | 75.0 | 0.9920 | 3.18 | 0.63 | 10.8 | 5 | bad |
13 | 6.6 | 0.16 | 0.40 | 1.50 | 0.044 | 48.0 | 143.0 | 0.9912 | 3.54 | 0.52 | 12.4 | 7 | good |
14 | 8.3 | 0.42 | 0.62 | 19.25 | 0.040 | 41.0 | 172.0 | 1.0002 | 2.98 | 0.67 | 9.7 | 5 | bad |
15 | 6.6 | 0.17 | 0.38 | 1.50 | 0.032 | 28.0 | 112.0 | 0.9914 | 3.25 | 0.55 | 11.4 | 7 | good |
16 | 6.3 | 0.48 | 0.04 | 1.10 | 0.046 | 30.0 | 99.0 | 0.9928 | 3.24 | 0.36 | 9.6 | 6 | good |
17 | 6.2 | 0.66 | 0.48 | 1.20 | 0.029 | 29.0 | 75.0 | 0.9892 | 3.33 | 0.39 | 12.8 | 8 | premium |
18 | 7.4 | 0.34 | 0.42 | 1.10 | 0.033 | 17.0 | 171.0 | 0.9917 | 3.12 | 0.53 | 11.3 | 6 | good |
19 | 6.5 | 0.31 | 0.14 | 7.50 | 0.044 | 34.0 | 133.0 | 0.9955 | 3.22 | 0.50 | 9.5 | 5 | bad |
# Load the book reviews/price dataset from an Excel workbook.
data4 = pd.read_excel(book_reviews)
# Sanity check: (rows, columns) of the loaded frame.
print(data4.shape)
# Preview the first five rows.
data4.head()
(100, 8)
Title | Author | Reviews | Ratings | Synopsis | Genre | BookCategory | Price | |
---|---|---|---|---|---|---|---|---|
0 | The Prisoner's Gold (The Hunters 3) | Chris Kuzneski | 4.0 out of 5 stars | 8 customer reviews | THE HUNTERS return in their third brilliant no... | Action & Adventure (Books) | Action & Adventure | 220.00 |
1 | Guru Dutt: A Tragedy in Three Acts | Arun Khopkar | 3.9 out of 5 stars | 14 customer reviews | A layered portrait of a troubled genius for wh... | Cinema & Broadcast (Books) | Biographies, Diaries & True Accounts | 202.93 |
2 | Leviathan (Penguin Classics) | Thomas Hobbes | 9.6 out of 10 stars | 6 customer reviews | "During the time men live without a common Pow... | International Relations | Humour | 299.00 |
3 | A Pocket Full of Rye (Miss Marple) | Agatha Christie | 8.2 out of 10 stars | 13 customer reviews | A handful of grain is found in the pocket of a... | Contemporary Fiction (Books) | Crime, Thriller & Mystery | 180.00 |
4 | ChiRunning: A Revolutionary Approach to Effort... | Danny Dreyer | 4.5 out of 5 stars | 8 customer reviews | The revised edition of the bestselling ChiRunn... | Healthy Living & Wellness (Books) | Sports | 900.00 |
# Split each author name at the first space only (n=1): column 0 is the first
# word, column 1 is everything after it.
data4.Author.str.split(" ",n=1, expand=True)
0 | 1 | |
---|---|---|
0 | Chris | Kuzneski |
1 | Arun | Khopkar |
2 | Thomas | Hobbes |
3 | Agatha | Christie |
4 | Danny | Dreyer |
... | ... | ... |
95 | Ruskin | Bond |
96 | William | Tomkins |
97 | Leander | Kahney |
98 | Susan | Sontag |
99 | Agrawal | P.K |
100 rows × 2 columns
# Create new columns Author_Firstname and Author_Lastname
# Split the author name once at the first space and reuse the result:
# column 0 -> first name, column 1 -> the remainder of the name.
author_parts = data4.Author.str.split(" ", n=1, expand=True)
data4['Author_Firstname'] = author_parts[0]
data4['Author_Lastname'] = author_parts[1]
data4.head()
Title | Author | Reviews | Ratings | Synopsis | Genre | BookCategory | Price | Author_Firstname | Author_Lastname | |
---|---|---|---|---|---|---|---|---|---|---|
0 | The Prisoner's Gold (The Hunters 3) | Chris Kuzneski | 4.0 out of 5 stars | 8 customer reviews | THE HUNTERS return in their third brilliant no... | Action & Adventure (Books) | Action & Adventure | 220.00 | Chris | Kuzneski |
1 | Guru Dutt: A Tragedy in Three Acts | Arun Khopkar | 3.9 out of 5 stars | 14 customer reviews | A layered portrait of a troubled genius for wh... | Cinema & Broadcast (Books) | Biographies, Diaries & True Accounts | 202.93 | Arun | Khopkar |
2 | Leviathan (Penguin Classics) | Thomas Hobbes | 9.6 out of 10 stars | 6 customer reviews | "During the time men live without a common Pow... | International Relations | Humour | 299.00 | Thomas | Hobbes |
3 | A Pocket Full of Rye (Miss Marple) | Agatha Christie | 8.2 out of 10 stars | 13 customer reviews | A handful of grain is found in the pocket of a... | Contemporary Fiction (Books) | Crime, Thriller & Mystery | 180.00 | Agatha | Christie |
4 | ChiRunning: A Revolutionary Approach to Effort... | Danny Dreyer | 4.5 out of 5 stars | 8 customer reviews | The revised edition of the bestselling ChiRunn... | Healthy Living & Wellness (Books) | Sports | 900.00 | Danny | Dreyer |
# Tokenise the review text on spaces, e.g. '4.0 out of 5 stars' ->
# ['4.0', 'out', 'of', '5', 'stars']; column 0 is the score, column 3 the scale.
data4.Reviews.str.split(" ", expand=True)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 4.0 | out | of | 5 | stars |
1 | 3.9 | out | of | 5 | stars |
2 | 9.6 | out | of | 10 | stars |
3 | 8.2 | out | of | 10 | stars |
4 | 4.5 | out | of | 5 | stars |
... | ... | ... | ... | ... | ... |
95 | 5.0 | out | of | 5 | stars |
96 | 4.0 | out | of | 5 | stars |
97 | 3.5 | out | of | 5 | stars |
98 | 4.5 | out | of | 5 | stars |
99 | 5.0 | out | of | 5 | stars |
100 rows × 5 columns
# Normalise every review score by its own scale so that 5-star and 10-star
# ratings are comparable ('4.0 out of 5 stars' -> 0.8, '9.6 out of 10 stars' -> 0.96).
# The result is a fraction in the score/scale sense, despite the column name.
review_parts = data4.Reviews.str.split(" ", expand=True)
data4['Reviews_percent'] = review_parts[0].astype('float') / review_parts[3].astype('float')
# Preview the frame that was actually modified (data4, not the house-price frame `data`).
data4.head()
area_type | availability | location | size | society | total_sqft | bath | balcony | other_costs | price | other_costs_list | total_cost | Reviews_percent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Super built-up Area | 19-Dec | Electronic City Phase II | 2 BHK | Coomee | 1056 | 2.0 | 1.0 | custom 2.0L reg 1.76L misc 3.7L | 39.07 | [2.0, 1.76, 3.7] | 46.53 | 0.80 |
1 | Plot Area | Ready To Move | Chikka Tirupathi | 4 Bedroom | Theanmp | 2600 | 5.0 | 3.0 | custom 2.4L reg 5.4L | 120.00 | [2.4, 5.4] | 127.80 | 0.78 |
2 | Built-up Area | Ready To Move | Uttarahalli | 3 BHK | NaN | 1440 | 2.0 | 3.0 | reg 2.79L misc 3.5L | 62.00 | [2.79, 3.5] | 68.29 | 0.96 |
3 | Super built-up Area | Ready To Move | Lingadheeranahalli | 3 BHK | Soiewre | 1521 | 3.0 | 1.0 | custom 2.5L reg 4.28L pd 1.34L misc 1.6L | 95.00 | [2.5, 4.28, 1.34, 1.6] | 104.72 | 0.82 |
4 | Super built-up Area | Ready To Move | Kothanur | 2 BHK | NaN | 1200 | 2.0 | 1.0 | reg 2.3L | 51.00 | [2.3] | 53.30 | 0.90 |
from scipy import stats
# Load the mtcars dataset.
data5 = pd.read_csv(mtcars)
# Sanity check: (rows, columns) of the loaded frame.
print(data5.shape)
# Preview the first five rows.
data5.head()
(32, 12)
car_model | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
1 | Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
2 | Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
3 | Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
4 | Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
stats.zscore(data5.hp) # z-scores for a single column (hp)
array([-0.54365487, -0.54365487, -0.7955699 , -0.54365487, 0.41954967, -0.61774753, 1.45684686, -1.25494437, -0.76593284, -0.35101396, -0.35101396, 0.49364233, 0.49364233, 0.49364233, 0.86410561, 1.01229092, 1.23456889, -1.19567025, -1.40312969, -1.21048878, -0.73629578, 0.04908639, 0.04908639, 1.45684686, 0.41954967, -1.19567025, -0.82520696, -0.49919927, 1.73839896, 0.41954967, 2.79051468, -0.5584734 ])
# Keep only the rows whose hp z-score has magnitude below 2.3 (drops hp outliers).
data5[abs(stats.zscore(data5.hp))<2.3]
car_model | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
1 | Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
2 | Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
3 | Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
4 | Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
5 | Valiant | 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
6 | Duster 360 | 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 | 0 | 0 | 3 | 4 |
7 | Merc 240D | 24.4 | 4 | 146.7 | 62 | 3.69 | 3.190 | 20.00 | 1 | 0 | 4 | 2 |
8 | Merc 230 | 22.8 | 4 | 140.8 | 95 | 3.92 | 3.150 | 22.90 | 1 | 0 | 4 | 2 |
9 | Merc 280 | 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 | 1 | 0 | 4 | 4 |
10 | Merc 280C | 17.8 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.90 | 1 | 0 | 4 | 4 |
11 | Merc 450SE | 16.4 | 8 | 275.8 | 180 | 3.07 | 4.070 | 17.40 | 0 | 0 | 3 | 3 |
12 | Merc 450SL | 17.3 | 8 | 275.8 | 180 | 3.07 | 3.730 | 17.60 | 0 | 0 | 3 | 3 |
13 | Merc 450SLC | 15.2 | 8 | 275.8 | 180 | 3.07 | 3.780 | 18.00 | 0 | 0 | 3 | 3 |
14 | Cadillac Fleetwood | 10.4 | 8 | 472.0 | 205 | 2.93 | 5.250 | 17.98 | 0 | 0 | 3 | 4 |
15 | Lincoln Continental | 10.4 | 8 | 460.0 | 215 | 3.00 | 5.424 | 17.82 | 0 | 0 | 3 | 4 |
16 | Chrysler Imperial | 14.7 | 8 | 440.0 | 230 | 3.23 | 5.345 | 17.42 | 0 | 0 | 3 | 4 |
17 | Fiat 128 | 32.4 | 4 | 78.7 | 66 | 4.08 | 2.200 | 19.47 | 1 | 1 | 4 | 1 |
18 | Honda Civic | 30.4 | 4 | 75.7 | 52 | 4.93 | 1.615 | 18.52 | 1 | 1 | 4 | 2 |
19 | Toyota Corolla | 33.9 | 4 | 71.1 | 65 | 4.22 | 1.835 | 19.90 | 1 | 1 | 4 | 1 |
20 | Toyota Corona | 21.5 | 4 | 120.1 | 97 | 3.70 | 2.465 | 20.01 | 1 | 0 | 3 | 1 |
21 | Dodge Challenger | 15.5 | 8 | 318.0 | 150 | 2.76 | 3.520 | 16.87 | 0 | 0 | 3 | 2 |
22 | AMC Javelin | 15.2 | 8 | 304.0 | 150 | 3.15 | 3.435 | 17.30 | 0 | 0 | 3 | 2 |
23 | Camaro Z28 | 13.3 | 8 | 350.0 | 245 | 3.73 | 3.840 | 15.41 | 0 | 0 | 3 | 4 |
24 | Pontiac Firebird | 19.2 | 8 | 400.0 | 175 | 3.08 | 3.845 | 17.05 | 0 | 0 | 3 | 2 |
25 | Fiat X1-9 | 27.3 | 4 | 79.0 | 66 | 4.08 | 1.935 | 18.90 | 1 | 1 | 4 | 1 |
26 | Porsche 914-2 | 26.0 | 4 | 120.3 | 91 | 4.43 | 2.140 | 16.70 | 0 | 1 | 5 | 2 |
27 | Lotus Europa | 30.4 | 4 | 95.1 | 113 | 3.77 | 1.513 | 16.90 | 1 | 1 | 5 | 2 |
28 | Ford Pantera L | 15.8 | 8 | 351.0 | 264 | 4.22 | 3.170 | 14.50 | 0 | 1 | 5 | 4 |
29 | Ferrari Dino | 19.7 | 6 | 145.0 | 175 | 3.62 | 2.770 | 15.50 | 0 | 1 | 5 | 6 |
31 | Volvo 142E | 21.4 | 4 | 121.0 | 109 | 4.11 | 2.780 | 18.60 | 1 | 1 | 4 | 2 |
# Keep rows where EVERY numeric column (all columns after car_model) lies
# within 3 standard deviations of its column mean.
# abs() must wrap the z-scores themselves, not the boolean comparison;
# otherwise large negative outliers (z <= -3) are never filtered out.
data5[(abs(stats.zscore(data5.iloc[:, 1:])) < 3).all(axis=1)]
car_model | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
1 | Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
2 | Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
3 | Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
4 | Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
5 | Valiant | 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
6 | Duster 360 | 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 | 0 | 0 | 3 | 4 |
7 | Merc 240D | 24.4 | 4 | 146.7 | 62 | 3.69 | 3.190 | 20.00 | 1 | 0 | 4 | 2 |
8 | Merc 230 | 22.8 | 4 | 140.8 | 95 | 3.92 | 3.150 | 22.90 | 1 | 0 | 4 | 2 |
9 | Merc 280 | 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 | 1 | 0 | 4 | 4 |
10 | Merc 280C | 17.8 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.90 | 1 | 0 | 4 | 4 |
11 | Merc 450SE | 16.4 | 8 | 275.8 | 180 | 3.07 | 4.070 | 17.40 | 0 | 0 | 3 | 3 |
12 | Merc 450SL | 17.3 | 8 | 275.8 | 180 | 3.07 | 3.730 | 17.60 | 0 | 0 | 3 | 3 |
13 | Merc 450SLC | 15.2 | 8 | 275.8 | 180 | 3.07 | 3.780 | 18.00 | 0 | 0 | 3 | 3 |
14 | Cadillac Fleetwood | 10.4 | 8 | 472.0 | 205 | 2.93 | 5.250 | 17.98 | 0 | 0 | 3 | 4 |
15 | Lincoln Continental | 10.4 | 8 | 460.0 | 215 | 3.00 | 5.424 | 17.82 | 0 | 0 | 3 | 4 |
16 | Chrysler Imperial | 14.7 | 8 | 440.0 | 230 | 3.23 | 5.345 | 17.42 | 0 | 0 | 3 | 4 |
17 | Fiat 128 | 32.4 | 4 | 78.7 | 66 | 4.08 | 2.200 | 19.47 | 1 | 1 | 4 | 1 |
18 | Honda Civic | 30.4 | 4 | 75.7 | 52 | 4.93 | 1.615 | 18.52 | 1 | 1 | 4 | 2 |
19 | Toyota Corolla | 33.9 | 4 | 71.1 | 65 | 4.22 | 1.835 | 19.90 | 1 | 1 | 4 | 1 |
20 | Toyota Corona | 21.5 | 4 | 120.1 | 97 | 3.70 | 2.465 | 20.01 | 1 | 0 | 3 | 1 |
21 | Dodge Challenger | 15.5 | 8 | 318.0 | 150 | 2.76 | 3.520 | 16.87 | 0 | 0 | 3 | 2 |
22 | AMC Javelin | 15.2 | 8 | 304.0 | 150 | 3.15 | 3.435 | 17.30 | 0 | 0 | 3 | 2 |
23 | Camaro Z28 | 13.3 | 8 | 350.0 | 245 | 3.73 | 3.840 | 15.41 | 0 | 0 | 3 | 4 |
24 | Pontiac Firebird | 19.2 | 8 | 400.0 | 175 | 3.08 | 3.845 | 17.05 | 0 | 0 | 3 | 2 |
25 | Fiat X1-9 | 27.3 | 4 | 79.0 | 66 | 4.08 | 1.935 | 18.90 | 1 | 1 | 4 | 1 |
26 | Porsche 914-2 | 26.0 | 4 | 120.3 | 91 | 4.43 | 2.140 | 16.70 | 0 | 1 | 5 | 2 |
27 | Lotus Europa | 30.4 | 4 | 95.1 | 113 | 3.77 | 1.513 | 16.90 | 1 | 1 | 5 | 2 |
28 | Ford Pantera L | 15.8 | 8 | 351.0 | 264 | 4.22 | 3.170 | 14.50 | 0 | 1 | 5 | 4 |
29 | Ferrari Dino | 19.7 | 6 | 145.0 | 175 | 3.62 | 2.770 | 15.50 | 0 | 1 | 5 | 6 |
31 | Volvo 142E | 21.4 | 4 | 121.0 | 109 | 4.11 | 2.780 | 18.60 | 1 | 1 | 4 | 2 |
# dropna = drop not available
# Load the mtcars variant that contains missing values, used for the cleaning
# steps that follow.
data6 = pd.read_csv(mtcars_missing)
# Sanity check: (rows, columns) of the loaded frame.
print(data6.shape)
# Preview the first five rows.
data6.head()
(32, 12)
car_model | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mazda RX4 | 21.0 | 6 | 160 | 110.0 | 3.90 | 2.62 | 16.46 | 0 | 1 | 4 | 4 |
1 | Mazda RX4 Wag | 21.0 | 6 | $ | 110.0 | NaN | 2.875 | * | 0 | 1 | 4 | 4 |
2 | Datsun 710 | NaN | 4 | 108 | NaN | 3.85 | ? | 18.61 | 1 | 1 | 4 | 1 |
3 | Hornet 4 Drive | 999.0 | 6 | NaN | NaN | 3.08 | 3.215 | - | 1 | 0 | 3 | 1 |
4 | Datsun 710 | 22.8 | 4 | NaN | 93.0 | 3.85 | 2.32 | 18.61 | 1 | 1 | 4 | 1 |
data6.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32 entries, 0 to 31 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 car_model 32 non-null object 1 mpg 31 non-null float64 2 cyl 32 non-null int64 3 disp 25 non-null object 4 hp 30 non-null float64 5 drat 31 non-null float64 6 wt 28 non-null object 7 qsec 20 non-null object 8 vs 32 non-null int64 9 am 32 non-null int64 10 gear 32 non-null int64 11 carb 32 non-null int64 dtypes: float64(3), int64(5), object(4) memory usage: 3.1+ KB
#data6.disp.sort_values()
# Treat the placeholder symbols '$', '?', '*' and '-' as missing values by
# replacing them with NaN in place.
# NOTE(review): replace() does not change column dtypes by itself; if disp/wt/qsec
# were read as object they likely still need pd.to_numeric — confirm.
data6.replace(['$', '?', '*', '-'],np.nan, inplace=True)
data6.head()
area_type | availability | location | size | society | total_sqft | bath | balcony | other_costs | price | other_costs_list | total_cost | Reviews_percent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Super built-up Area | 19-Dec | Electronic City Phase II | 2 BHK | Coomee | 1056 | 2.0 | 1.0 | custom 2.0L reg 1.76L misc 3.7L | 39.07 | [2.0, 1.76, 3.7] | 46.53 | 0.80 |
1 | Plot Area | Ready To Move | Chikka Tirupathi | 4 Bedroom | Theanmp | 2600 | 5.0 | 3.0 | custom 2.4L reg 5.4L | 120.00 | [2.4, 5.4] | 127.80 | 0.78 |
2 | Built-up Area | Ready To Move | Uttarahalli | 3 BHK | NaN | 1440 | 2.0 | 3.0 | reg 2.79L misc 3.5L | 62.00 | [2.79, 3.5] | 68.29 | 0.96 |
3 | Super built-up Area | Ready To Move | Lingadheeranahalli | 3 BHK | Soiewre | 1521 | 3.0 | 1.0 | custom 2.5L reg 4.28L pd 1.34L misc 1.6L | 95.00 | [2.5, 4.28, 1.34, 1.6] | 104.72 | 0.82 |
4 | Super built-up Area | Ready To Move | Kothanur | 2 BHK | NaN | 1200 | 2.0 | 1.0 | reg 2.3L | 51.00 | [2.3] | 53.30 | 0.90 |
data6.isnull().sum()
car_model 0 mpg 1 cyl 0 disp 9 hp 2 drat 1 wt 6 qsec 14 vs 0 am 0 gear 0 carb 0 dtype: int64
data6.isnull().sum().sum()
33
# data6.dropna(how='all')
# data6.dropna(how='any')
# thresh: minimum number of non-null values required to keep a column (here 60% of the rows)
# 60% of the row count: the non-null threshold used for dropna below.
len(data6) * 0.6
19.2
# Drop every column that has fewer than 60% non-null values.
# `thresh` is documented as an integer count, so round the fractional value up
# (19.2 -> 20); this keeps exactly the same columns as comparing against 19.2.
data6.dropna(thresh=int(np.ceil(len(data6) * 0.6)), axis=1)
car_model | mpg | cyl | disp | hp | drat | wt | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mazda RX4 | 21.0 | 6 | 160 | 110.0 | 3.90 | 2.62 | 0 | 1 | 4 | 4 |
1 | Mazda RX4 Wag | 21.0 | 6 | NaN | 110.0 | NaN | 2.875 | 0 | 1 | 4 | 4 |
2 | Datsun 710 | NaN | 4 | 108 | NaN | 3.85 | NaN | 1 | 1 | 4 | 1 |
3 | Hornet 4 Drive | 999.0 | 6 | NaN | NaN | 3.08 | 3.215 | 1 | 0 | 3 | 1 |
4 | Datsun 710 | 22.8 | 4 | NaN | 93.0 | 3.85 | 2.32 | 1 | 1 | 4 | 1 |
5 | Valiant | 18.1 | 6 | 225 | 105.0 | 2.76 | 3.46 | 1 | 0 | 3 | 1 |
6 | Duster 360 | 14.3 | 8 | 360 | 245.0 | 3.21 | 3.57 | 0 | 0 | 3 | 4 |
7 | Valiant 2 | 18.1 | 6 | 225 | 105.0 | 2.76 | NaN | 1 | 0 | 3 | 1 |
8 | Valiant | 18.1 | 6 | 225 | 105.0 | 2.76 | 3.46 | 1 | 0 | 3 | 1 |
9 | Merc 280 | 19.2 | 6 | 167.6 | 123.0 | 3.92 | 3.44 | 1 | 0 | 4 | 4 |
10 | Merc 280C | 17.8 | 6 | 167.6 | 123.0 | 3.92 | 3.44 | 1 | 0 | 4 | 4 |
11 | Merc 450SE | 16.4 | 8 | 275.8 | 180.0 | 3.07 | 4.07 | 0 | 0 | 3 | 3 |
12 | Merc 450SL | 17.3 | 8 | 275.8 | 180.0 | 3.07 | NaN | 0 | 0 | 3 | 3 |
13 | Merc 450SLC | 15.2 | 8 | 275.8 | 180.0 | 3.07 | 3.78 | 0 | 0 | 3 | 3 |
14 | Cadillac Fleetwood | 10.4 | 8 | NaN | 205.0 | 2.93 | 5.25 | 0 | 0 | 3 | 4 |
15 | Lincoln Continental | 10.4 | 8 | 460 | 215.0 | 3.00 | 5.424 | 0 | 0 | 3 | 4 |
16 | Chrysler Imperial | 14.7 | 8 | 440 | 230.0 | 3.23 | 5.345 | 0 | 0 | 3 | 4 |
17 | Fiat 128 | 32.4 | 4 | NaN | 66.0 | 4.08 | 2.2 | 1 | 1 | 4 | 1 |
18 | Honda Civic | 30.4 | 4 | 75.7 | 52.0 | 4.93 | 1.615 | 1 | 1 | 4 | 2 |
19 | Toyota Corolla | 33.9 | 4 | 71.1 | 65.0 | 4.22 | 1.835 | 1 | 1 | 4 | 1 |
20 | Toyota Corona | 21.5 | 4 | 120.1 | 97.0 | 3.70 | 2.465 | 1 | 0 | 3 | 1 |
21 | Dodge Challenger | 15.5 | 8 | NaN | 150.0 | 2.76 | NaN | 0 | 0 | 3 | 2 |
22 | AMC Javelin | 15.2 | 8 | 304 | 150.0 | 3.15 | 3.435 | 0 | 0 | 3 | 2 |
23 | Camaro Z28 | 13.3 | 8 | 350 | 245.0 | 3.73 | 3.84 | 0 | 0 | 3 | 4 |
24 | Pontiac Firebird | 19.2 | 8 | 400 | 175.0 | 3.08 | 3.845 | 0 | 0 | 3 | 2 |
25 | Fiat X1-9 | 27.3 | 4 | NaN | 66.0 | 4.08 | NaN | 1 | 1 | 4 | 1 |
26 | Porsche 914-2 | 26.0 | 4 | 120.3 | 91.0 | 4.43 | 2.14 | 0 | 1 | 5 | 2 |
27 | Lotus Europa | 30.4 | 4 | 95.1 | 113.0 | 3.77 | NaN | 1 | 1 | 5 | 2 |
28 | Ford Pantera L | 15.8 | 8 | NaN | 264.0 | 4.22 | 3.17 | 0 | 1 | 5 | 4 |
29 | Ferrari Dino | 19.7 | 6 | 145 | 175.0 | 3.62 | 2.77 | 0 | 1 | 5 | 6 |
30 | Maserati Bora | 15.0 | 8 | NaN | 335.0 | 3.54 | 3.57 | 0 | 1 | 5 | 8 |
31 | Volvo 142E | 21.4 | 4 | 121 | 109.0 | 4.11 | 2.78 | 1 | 1 | 4 | 2 |
# Load the Bangalore house-price dataset.
data7 = pd.read_csv(house_price_bangalore)
# Sanity check: (rows, columns) of the loaded frame.
print(data7.shape)
# Preview the first five rows.
data7.head()
(13320, 9)
area_type | availability | location | size | society | total_sqft | bath | balcony | price | |
---|---|---|---|---|---|---|---|---|---|
0 | Super built-up Area | 19-Dec | Electronic City Phase II | 2 BHK | Coomee | 1056 | 2.0 | 1.0 | 39.07 |
1 | Plot Area | Ready To Move | Chikka Tirupathi | 4 Bedroom | Theanmp | 2600 | 5.0 | 3.0 | 120.00 |
2 | Built-up Area | Ready To Move | Uttarahalli | 3 BHK | NaN | 1440 | 2.0 | 3.0 | 62.00 |
3 | Super built-up Area | Ready To Move | Lingadheeranahalli | 3 BHK | Soiewre | 1521 | 3.0 | 1.0 | 95.00 |
4 | Super built-up Area | Ready To Move | Kothanur | 2 BHK | NaN | 1200 | 2.0 | 1.0 | 51.00 |
from sklearn.preprocessing import LabelEncoder
# Show only the three categorical columns that are label-encoded further down.
data7[['area_type', 'location', 'society']]
area_type | location | society | |
---|---|---|---|
0 | Super built-up Area | Electronic City Phase II | Coomee |
1 | Plot Area | Chikka Tirupathi | Theanmp |
2 | Built-up Area | Uttarahalli | NaN |
3 | Super built-up Area | Lingadheeranahalli | Soiewre |
4 | Super built-up Area | Kothanur | NaN |
... | ... | ... | ... |
13315 | Built-up Area | Whitefield | ArsiaEx |
13316 | Super built-up Area | Richards Town | NaN |
13317 | Built-up Area | Raja Rajeshwari Nagar | Mahla T |
13318 | Super built-up Area | Padmanabhanagar | SollyCl |
13319 | Super built-up Area | Doddathoguru | NaN |
13320 rows × 3 columns
# Forward-fill missing values in place (each NaN takes the value of the row above)
# before label encoding. DataFrame.ffill replaces the deprecated
# fillna(method='ffill') spelling.
data7.ffill(inplace=True)
# Preview only: integer-encode each categorical column independently
# (fit_transform is re-run on every column passed by apply).
data7.loc[:,['area_type', 'location', 'society']].apply(LabelEncoder().fit_transform)
area_type | location | society | |
---|---|---|---|
0 | 3 | 419 | 464 |
1 | 2 | 317 | 2439 |
2 | 0 | 1179 | 2439 |
3 | 3 | 757 | 2186 |
4 | 3 | 716 | 2186 |
... | ... | ... | ... |
13315 | 0 | 1252 | 209 |
13316 | 3 | 1004 | 209 |
13317 | 0 | 972 | 1216 |
13318 | 3 | 907 | 2205 |
13319 | 3 | 396 | 2205 |
13320 rows × 3 columns
# Replace the categorical columns with their integer label encodings.
# Plain column assignment swaps in the new integer columns; assigning through
# .loc[:, cols] sets values in place on pandas >= 2.0 and would leave the
# columns with their original object dtype.
data7[['area_type', 'location', 'society']] = (
    data7[['area_type', 'location', 'society']].apply(LabelEncoder().fit_transform)
)
data7.head()
area_type | availability | location | size | society | total_sqft | bath | balcony | price | |
---|---|---|---|---|---|---|---|---|---|
0 | 3 | 19-Dec | 419 | 2 BHK | 464 | 1056 | 2.0 | 1.0 | 39.07 |
1 | 2 | Ready To Move | 317 | 4 Bedroom | 2439 | 2600 | 5.0 | 3.0 | 120.00 |
2 | 0 | Ready To Move | 1179 | 3 BHK | 2439 | 1440 | 2.0 | 3.0 | 62.00 |
3 | 3 | Ready To Move | 757 | 3 BHK | 2186 | 1521 | 3.0 | 1.0 | 95.00 |
4 | 3 | Ready To Move | 716 | 2 BHK | 2186 | 1200 | 2.0 | 1.0 | 51.00 |
# Load the breast-cancer dataset from the URL constant defined at the top of the file.
data8= pd.read_csv(breast_cancer)
print(data8.shape)
# With the default display settings the middle columns are elided ('...') in the preview.
data8.head()
(569, 31)
mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 |
5 rows × 31 columns
# Raise the displayed-column limit to 50 so all 31 columns are rendered instead of being elided.
pd.set_option('display.max_columns', 50)
data8.head()
mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | radius error | texture error | perimeter error | area error | smoothness error | compactness error | concavity error | concave points error | symmetry error | fractal dimension error | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 |
# Tip: run `pd.set_option?` to see every available display option (very useful).
# Load the superstore orders with default parsing (no date handling requested).
data9 = pd.read_csv(superstore_orders)
print(data9.shape)
data9.head()
(9994, 21)
row_id | order_id | order_date | ship_date | ship_mode | customer_id | customer_name | segment | country | city | state | postal_code | region | product_id | category | sub_category | product_name | sales | quantity | discount | profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420.0 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
1 | 2 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420.0 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
2 | 3 | CA-2017-138688 | 2017-06-12 | 2017-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | California | 90036.0 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
3 | 4 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311.0 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
4 | 5 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311.0 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
# Reload, asking read_csv to parse the two date columns so their values are
# Timestamps (needed for the .month / .year lookups that follow).
data9 = pd.read_csv(superstore_orders, parse_dates=['order_date','ship_date'])
print(data9.shape)
data9.head()
(9994, 21)
row_id | order_id | order_date | ship_date | ship_mode | customer_id | customer_name | segment | country | city | state | postal_code | region | product_id | category | sub_category | product_name | sales | quantity | discount | profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420.0 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
1 | 2 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420.0 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
2 | 3 | CA-2017-138688 | 2017-06-12 | 2017-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | California | 90036.0 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
3 | 4 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311.0 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
4 | 5 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311.0 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
# First order date; a Timestamp because the column was parsed as dates on load.
data9.order_date[0]
Timestamp('2017-11-08 00:00:00')
# Month component of the first order date.
data9.order_date[0].month
11
# Year component of the first order date. Must read from data9: `data` is the
# hp_costs frame loaded at the top of the file and has no order_date column.
data9.order_date[0].year
2017
# Derive year and month columns from the parsed order dates using the
# vectorised .dt accessor instead of a per-row Python lambda.
data9['orderyear'] = data9.order_date.dt.year
data9['ordermonth'] = data9.order_date.dt.month
data9.head()
row_id | order_id | order_date | ship_date | ship_mode | customer_id | customer_name | segment | country | city | state | postal_code | region | product_id | category | sub_category | product_name | sales | quantity | discount | profit | orderyear | ordermonth | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420.0 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 | 2017 | 11 |
1 | 2 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420.0 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 | 2017 | 11 |
2 | 3 | CA-2017-138688 | 2017-06-12 | 2017-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | California | 90036.0 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 | 2017 | 6 |
3 | 4 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311.0 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 | 2016 | 10 |
4 | 5 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311.0 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 | 2016 | 10 |
# Total sales per category (rows) and order year (columns), shaded with the coolwarm colormap.
(
    data9
    .pivot_table(values='sales', index='category', columns='orderyear', aggfunc='sum')
    .style
    .background_gradient(cmap='coolwarm')
)
orderyear | 2015 | 2016 | 2017 | 2018 |
---|---|---|---|---|
category | ||||
Furniture | 157192.853100 | 170518.237000 | 198901.436000 | 215387.269200 |
Office Supplies | 151776.412000 | 137233.463000 | 183939.982000 | 246097.175000 |
Technology | 175278.233000 | 162780.809000 | 226364.180000 | 271730.811000 |
# Year-over-year percentage change in total sales per category
# (the first year has no predecessor, hence NaN).
(
    data9
    .pivot_table(values='sales', index='category', columns='orderyear', aggfunc='sum')
    .pct_change(axis=1)
) * 100
orderyear | 2015 | 2016 | 2017 | 2018 |
---|---|---|---|---|
category | ||||
Furniture | NaN | 8.477093 | 16.645257 | 8.288444 |
Office Supplies | NaN | -9.581824 | 34.034351 | 33.792106 |
Technology | NaN | -7.130049 | 39.060729 | 20.041435 |
# Year-over-year percentage change in total profit per category
# (the first year has no predecessor, hence NaN).
(
    data9
    .pivot_table(values='profit', index='category', columns='orderyear', aggfunc='sum')
    .pct_change(axis=1)
) * 100
orderyear | 2015 | 2016 | 2017 | 2018 |
---|---|---|---|---|
category | ||||
Furniture | NaN | -44.753489 | 130.828682 | -56.632017 |
Office Supplies | NaN | 11.092248 | 39.688767 | 13.334936 |
Technology | NaN | 55.883907 | 18.714631 | 27.430650 |
# Load the breast-cancer dataset again for the correlation heatmap below.
data11 = pd.read_csv(breast_cancer)
data11.head()
mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | radius error | texture error | perimeter error | area error | smoothness error | compactness error | concavity error | concave points error | symmetry error | fractal dimension error | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 |
# Pairwise correlation matrix of all columns, rendered as a heatmap with the coolwarm colormap.
data11.corr().style.background_gradient('coolwarm')
# Alternative colormap: data11.corr().style.background_gradient('twilight')
mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | radius error | texture error | perimeter error | area error | smoothness error | compactness error | concavity error | concave points error | symmetry error | fractal dimension error | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mean radius | 1.000000 | 0.323782 | 0.997855 | 0.987357 | 0.170581 | 0.506124 | 0.676764 | 0.822529 | 0.147741 | -0.311631 | 0.679090 | -0.097317 | 0.674172 | 0.735864 | -0.222600 | 0.206000 | 0.194204 | 0.376169 | -0.104321 | -0.042641 | 0.969539 | 0.297008 | 0.965137 | 0.941082 | 0.119616 | 0.413463 | 0.526911 | 0.744214 | 0.163953 | 0.007066 | 0.730029 |
mean texture | 0.323782 | 1.000000 | 0.329533 | 0.321086 | -0.023389 | 0.236702 | 0.302418 | 0.293464 | 0.071401 | -0.076437 | 0.275869 | 0.386358 | 0.281673 | 0.259845 | 0.006614 | 0.191975 | 0.143293 | 0.163851 | 0.009127 | 0.054458 | 0.352573 | 0.912045 | 0.358040 | 0.343546 | 0.077503 | 0.277830 | 0.301025 | 0.295316 | 0.105008 | 0.119205 | 0.415185 |
mean perimeter | 0.997855 | 0.329533 | 1.000000 | 0.986507 | 0.207278 | 0.556936 | 0.716136 | 0.850977 | 0.183027 | -0.261477 | 0.691765 | -0.086761 | 0.693135 | 0.744983 | -0.202694 | 0.250744 | 0.228082 | 0.407217 | -0.081629 | -0.005523 | 0.969476 | 0.303038 | 0.970387 | 0.941550 | 0.150549 | 0.455774 | 0.563879 | 0.771241 | 0.189115 | 0.051019 | 0.742636 |
mean area | 0.987357 | 0.321086 | 0.986507 | 1.000000 | 0.177028 | 0.498502 | 0.685983 | 0.823269 | 0.151293 | -0.283110 | 0.732562 | -0.066280 | 0.726628 | 0.800086 | -0.166777 | 0.212583 | 0.207660 | 0.372320 | -0.072497 | -0.019887 | 0.962746 | 0.287489 | 0.959120 | 0.959213 | 0.123523 | 0.390410 | 0.512606 | 0.722017 | 0.143570 | 0.003738 | 0.708984 |
mean smoothness | 0.170581 | -0.023389 | 0.207278 | 0.177028 | 1.000000 | 0.659123 | 0.521984 | 0.553695 | 0.557775 | 0.584792 | 0.301467 | 0.068406 | 0.296092 | 0.246552 | 0.332375 | 0.318943 | 0.248396 | 0.380676 | 0.200774 | 0.283607 | 0.213120 | 0.036072 | 0.238853 | 0.206718 | 0.805324 | 0.472468 | 0.434926 | 0.503053 | 0.394309 | 0.499316 | 0.358560 |
mean compactness | 0.506124 | 0.236702 | 0.556936 | 0.498502 | 0.659123 | 1.000000 | 0.883121 | 0.831135 | 0.602641 | 0.565369 | 0.497473 | 0.046205 | 0.548905 | 0.455653 | 0.135299 | 0.738722 | 0.570517 | 0.642262 | 0.229977 | 0.507318 | 0.535315 | 0.248133 | 0.590210 | 0.509604 | 0.565541 | 0.865809 | 0.816275 | 0.815573 | 0.510223 | 0.687382 | 0.596534 |
mean concavity | 0.676764 | 0.302418 | 0.716136 | 0.685983 | 0.521984 | 0.883121 | 1.000000 | 0.921391 | 0.500667 | 0.336783 | 0.631925 | 0.076218 | 0.660391 | 0.617427 | 0.098564 | 0.670279 | 0.691270 | 0.683260 | 0.178009 | 0.449301 | 0.688236 | 0.299879 | 0.729565 | 0.675987 | 0.448822 | 0.754968 | 0.884103 | 0.861323 | 0.409464 | 0.514930 | 0.696360 |
mean concave points | 0.822529 | 0.293464 | 0.850977 | 0.823269 | 0.553695 | 0.831135 | 0.921391 | 1.000000 | 0.462497 | 0.166917 | 0.698050 | 0.021480 | 0.710650 | 0.690299 | 0.027653 | 0.490424 | 0.439167 | 0.615634 | 0.095351 | 0.257584 | 0.830318 | 0.292752 | 0.855923 | 0.809630 | 0.452753 | 0.667454 | 0.752399 | 0.910155 | 0.375744 | 0.368661 | 0.776614 |
mean symmetry | 0.147741 | 0.071401 | 0.183027 | 0.151293 | 0.557775 | 0.602641 | 0.500667 | 0.462497 | 1.000000 | 0.479921 | 0.303379 | 0.128053 | 0.313893 | 0.223970 | 0.187321 | 0.421659 | 0.342627 | 0.393298 | 0.449137 | 0.331786 | 0.185728 | 0.090651 | 0.219169 | 0.177193 | 0.426675 | 0.473200 | 0.433721 | 0.430297 | 0.699826 | 0.438413 | 0.330499 |
mean fractal dimension | -0.311631 | -0.076437 | -0.261477 | -0.283110 | 0.584792 | 0.565369 | 0.336783 | 0.166917 | 0.479921 | 1.000000 | 0.000111 | 0.164174 | 0.039830 | -0.090170 | 0.401964 | 0.559837 | 0.446630 | 0.341198 | 0.345007 | 0.688132 | -0.253691 | -0.051269 | -0.205151 | -0.231854 | 0.504942 | 0.458798 | 0.346234 | 0.175325 | 0.334019 | 0.767297 | -0.012838 |
radius error | 0.679090 | 0.275869 | 0.691765 | 0.732562 | 0.301467 | 0.497473 | 0.631925 | 0.698050 | 0.303379 | 0.000111 | 1.000000 | 0.213247 | 0.972794 | 0.951830 | 0.164514 | 0.356065 | 0.332358 | 0.513346 | 0.240567 | 0.227754 | 0.715065 | 0.194799 | 0.719684 | 0.751548 | 0.141919 | 0.287103 | 0.380585 | 0.531062 | 0.094543 | 0.049559 | 0.567134 |
texture error | -0.097317 | 0.386358 | -0.086761 | -0.066280 | 0.068406 | 0.046205 | 0.076218 | 0.021480 | 0.128053 | 0.164174 | 0.213247 | 1.000000 | 0.223171 | 0.111567 | 0.397243 | 0.231700 | 0.194998 | 0.230283 | 0.411621 | 0.279723 | -0.111690 | 0.409003 | -0.102242 | -0.083195 | -0.073658 | -0.092439 | -0.068956 | -0.119638 | -0.128215 | -0.045655 | -0.008303 |
perimeter error | 0.674172 | 0.281673 | 0.693135 | 0.726628 | 0.296092 | 0.548905 | 0.660391 | 0.710650 | 0.313893 | 0.039830 | 0.972794 | 0.223171 | 1.000000 | 0.937655 | 0.151075 | 0.416322 | 0.362482 | 0.556264 | 0.266487 | 0.244143 | 0.697201 | 0.200371 | 0.721031 | 0.730713 | 0.130054 | 0.341919 | 0.418899 | 0.554897 | 0.109930 | 0.085433 | 0.556141 |
area error | 0.735864 | 0.259845 | 0.744983 | 0.800086 | 0.246552 | 0.455653 | 0.617427 | 0.690299 | 0.223970 | -0.090170 | 0.951830 | 0.111567 | 0.937655 | 1.000000 | 0.075150 | 0.284840 | 0.270895 | 0.415730 | 0.134109 | 0.127071 | 0.757373 | 0.196497 | 0.761213 | 0.811408 | 0.125389 | 0.283257 | 0.385100 | 0.538166 | 0.074126 | 0.017539 | 0.548236 |
smoothness error | -0.222600 | 0.006614 | -0.202694 | -0.166777 | 0.332375 | 0.135299 | 0.098564 | 0.027653 | 0.187321 | 0.401964 | 0.164514 | 0.397243 | 0.151075 | 0.075150 | 1.000000 | 0.336696 | 0.268685 | 0.328429 | 0.413506 | 0.427374 | -0.230691 | -0.074743 | -0.217304 | -0.182195 | 0.314457 | -0.055558 | -0.058298 | -0.102007 | -0.107342 | 0.101480 | -0.067016 |
compactness error | 0.206000 | 0.191975 | 0.250744 | 0.212583 | 0.318943 | 0.738722 | 0.670279 | 0.490424 | 0.421659 | 0.559837 | 0.356065 | 0.231700 | 0.416322 | 0.284840 | 0.336696 | 1.000000 | 0.801268 | 0.744083 | 0.394713 | 0.803269 | 0.204607 | 0.143003 | 0.260516 | 0.199371 | 0.227394 | 0.678780 | 0.639147 | 0.483208 | 0.277878 | 0.590973 | 0.292999 |
concavity error | 0.194204 | 0.143293 | 0.228082 | 0.207660 | 0.248396 | 0.570517 | 0.691270 | 0.439167 | 0.342627 | 0.446630 | 0.332358 | 0.194998 | 0.362482 | 0.270895 | 0.268685 | 0.801268 | 1.000000 | 0.771804 | 0.309429 | 0.727372 | 0.186904 | 0.100241 | 0.226680 | 0.188353 | 0.168481 | 0.484858 | 0.662564 | 0.440472 | 0.197788 | 0.439329 | 0.253730 |
concave points error | 0.376169 | 0.163851 | 0.407217 | 0.372320 | 0.380676 | 0.642262 | 0.683260 | 0.615634 | 0.393298 | 0.341198 | 0.513346 | 0.230283 | 0.556264 | 0.415730 | 0.328429 | 0.744083 | 0.771804 | 1.000000 | 0.312780 | 0.611044 | 0.358127 | 0.086741 | 0.394999 | 0.342271 | 0.215351 | 0.452888 | 0.549592 | 0.602450 | 0.143116 | 0.310655 | 0.408042 |
symmetry error | -0.104321 | 0.009127 | -0.081629 | -0.072497 | 0.200774 | 0.229977 | 0.178009 | 0.095351 | 0.449137 | 0.345007 | 0.240567 | 0.411621 | 0.266487 | 0.134109 | 0.413506 | 0.394713 | 0.309429 | 0.312780 | 1.000000 | 0.369078 | -0.128121 | -0.077473 | -0.103753 | -0.110343 | -0.012662 | 0.060255 | 0.037119 | -0.030413 | 0.389402 | 0.078079 | -0.006522 |
fractal dimension error | -0.042641 | 0.054458 | -0.005523 | -0.019887 | 0.283607 | 0.507318 | 0.449301 | 0.257584 | 0.331786 | 0.688132 | 0.227754 | 0.279723 | 0.244143 | 0.127071 | 0.427374 | 0.803269 | 0.727372 | 0.611044 | 0.369078 | 1.000000 | -0.037488 | -0.003195 | -0.001000 | -0.022736 | 0.170568 | 0.390159 | 0.379975 | 0.215204 | 0.111094 | 0.591328 | 0.077972 |
worst radius | 0.969539 | 0.352573 | 0.969476 | 0.962746 | 0.213120 | 0.535315 | 0.688236 | 0.830318 | 0.185728 | -0.253691 | 0.715065 | -0.111690 | 0.697201 | 0.757373 | -0.230691 | 0.204607 | 0.186904 | 0.358127 | -0.128121 | -0.037488 | 1.000000 | 0.359921 | 0.993708 | 0.984015 | 0.216574 | 0.475820 | 0.573975 | 0.787424 | 0.243529 | 0.093492 | 0.776454 |
worst texture | 0.297008 | 0.912045 | 0.303038 | 0.287489 | 0.036072 | 0.248133 | 0.299879 | 0.292752 | 0.090651 | -0.051269 | 0.194799 | 0.409003 | 0.200371 | 0.196497 | -0.074743 | 0.143003 | 0.100241 | 0.086741 | -0.077473 | -0.003195 | 0.359921 | 1.000000 | 0.365098 | 0.345842 | 0.225429 | 0.360832 | 0.368366 | 0.359755 | 0.233027 | 0.219122 | 0.456903 |
worst perimeter | 0.965137 | 0.358040 | 0.970387 | 0.959120 | 0.238853 | 0.590210 | 0.729565 | 0.855923 | 0.219169 | -0.205151 | 0.719684 | -0.102242 | 0.721031 | 0.761213 | -0.217304 | 0.260516 | 0.226680 | 0.394999 | -0.103753 | -0.001000 | 0.993708 | 0.365098 | 1.000000 | 0.977578 | 0.236775 | 0.529408 | 0.618344 | 0.816322 | 0.269493 | 0.138957 | 0.782914 |
worst area | 0.941082 | 0.343546 | 0.941550 | 0.959213 | 0.206718 | 0.509604 | 0.675987 | 0.809630 | 0.177193 | -0.231854 | 0.751548 | -0.083195 | 0.730713 | 0.811408 | -0.182195 | 0.199371 | 0.188353 | 0.342271 | -0.110343 | -0.022736 | 0.984015 | 0.345842 | 0.977578 | 1.000000 | 0.209145 | 0.438296 | 0.543331 | 0.747419 | 0.209146 | 0.079647 | 0.733825 |
worst smoothness | 0.119616 | 0.077503 | 0.150549 | 0.123523 | 0.805324 | 0.565541 | 0.448822 | 0.452753 | 0.426675 | 0.504942 | 0.141919 | -0.073658 | 0.130054 | 0.125389 | 0.314457 | 0.227394 | 0.168481 | 0.215351 | -0.012662 | 0.170568 | 0.216574 | 0.225429 | 0.236775 | 0.209145 | 1.000000 | 0.568187 | 0.518523 | 0.547691 | 0.493838 | 0.617624 | 0.421465 |
worst compactness | 0.413463 | 0.277830 | 0.455774 | 0.390410 | 0.472468 | 0.865809 | 0.754968 | 0.667454 | 0.473200 | 0.458798 | 0.287103 | -0.092439 | 0.341919 | 0.283257 | -0.055558 | 0.678780 | 0.484858 | 0.452888 | 0.060255 | 0.390159 | 0.475820 | 0.360832 | 0.529408 | 0.438296 | 0.568187 | 1.000000 | 0.892261 | 0.801080 | 0.614441 | 0.810455 | 0.590998 |
worst concavity | 0.526911 | 0.301025 | 0.563879 | 0.512606 | 0.434926 | 0.816275 | 0.884103 | 0.752399 | 0.433721 | 0.346234 | 0.380585 | -0.068956 | 0.418899 | 0.385100 | -0.058298 | 0.639147 | 0.662564 | 0.549592 | 0.037119 | 0.379975 | 0.573975 | 0.368366 | 0.618344 | 0.543331 | 0.518523 | 0.892261 | 1.000000 | 0.855434 | 0.532520 | 0.686511 | 0.659610 |
worst concave points | 0.744214 | 0.295316 | 0.771241 | 0.722017 | 0.503053 | 0.815573 | 0.861323 | 0.910155 | 0.430297 | 0.175325 | 0.531062 | -0.119638 | 0.554897 | 0.538166 | -0.102007 | 0.483208 | 0.440472 | 0.602450 | -0.030413 | 0.215204 | 0.787424 | 0.359755 | 0.816322 | 0.747419 | 0.547691 | 0.801080 | 0.855434 | 1.000000 | 0.502528 | 0.511114 | 0.793566 |
worst symmetry | 0.163953 | 0.105008 | 0.189115 | 0.143570 | 0.394309 | 0.510223 | 0.409464 | 0.375744 | 0.699826 | 0.334019 | 0.094543 | -0.128215 | 0.109930 | 0.074126 | -0.107342 | 0.277878 | 0.197788 | 0.143116 | 0.389402 | 0.111094 | 0.243529 | 0.233027 | 0.269493 | 0.209146 | 0.493838 | 0.614441 | 0.532520 | 0.502528 | 1.000000 | 0.537848 | 0.416294 |
worst fractal dimension | 0.007066 | 0.119205 | 0.051019 | 0.003738 | 0.499316 | 0.687382 | 0.514930 | 0.368661 | 0.438413 | 0.767297 | 0.049559 | -0.045655 | 0.085433 | 0.017539 | 0.101480 | 0.590973 | 0.439329 | 0.310655 | 0.078079 | 0.591328 | 0.093492 | 0.219122 | 0.138957 | 0.079647 | 0.617624 | 0.810455 | 0.686511 | 0.511114 | 0.537848 | 1.000000 | 0.323872 |
outcome | 0.730029 | 0.415185 | 0.742636 | 0.708984 | 0.358560 | 0.596534 | 0.696360 | 0.776614 | 0.330499 | -0.012838 | 0.567134 | -0.008303 | 0.556141 | 0.548236 | -0.067016 | 0.292999 | 0.253730 | 0.408042 | -0.006522 | 0.077972 | 0.776454 | 0.456903 | 0.782914 | 0.733825 | 0.421465 | 0.590998 | 0.659610 | 0.793566 | 0.416294 | 0.323872 | 1.000000 |
# Default load: postal_code is displayed as a float (e.g. 42420.0).
data12 = pd.read_csv(superstore_orders)
data12.head()
row_id | order_id | order_date | ship_date | ship_mode | customer_id | customer_name | segment | country | city | state | postal_code | region | product_id | category | sub_category | product_name | sales | quantity | discount | profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420.0 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
1 | 2 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420.0 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
2 | 3 | CA-2017-138688 | 2017-06-12 | 2017-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | California | 90036.0 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
3 | 4 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311.0 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
4 | 5 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311.0 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
# Reload with postal_code forced to string so the codes are kept as text
# rather than being parsed as floats.
data12 = pd.read_csv(superstore_orders, dtype= {'postal_code':'str'})
data12.head()
row_id | order_id | order_date | ship_date | ship_mode | customer_id | customer_name | segment | country | city | state | postal_code | region | product_id | category | sub_category | product_name | sales | quantity | discount | profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
1 | 2 | CA-2017-152156 | 2017-11-08 | 2017-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
2 | 3 | CA-2017-138688 | 2017-06-12 | 2017-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | California | 90036 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
3 | 4 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
4 | 5 | US-2016-108966 | 2016-10-11 | 2016-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
# Column dtypes and non-null counts; postal_code now reports dtype object.
data12.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 row_id 9994 non-null int64 1 order_id 9994 non-null object 2 order_date 9994 non-null object 3 ship_date 9994 non-null object 4 ship_mode 9994 non-null object 5 customer_id 9994 non-null object 6 customer_name 9994 non-null object 7 segment 9994 non-null object 8 country 9994 non-null object 9 city 9994 non-null object 10 state 9994 non-null object 11 postal_code 9983 non-null object 12 region 9994 non-null object 13 product_id 9994 non-null object 14 category 9994 non-null object 15 sub_category 9994 non-null object 16 product_name 9994 non-null object 17 sales 9994 non-null float64 18 quantity 9994 non-null int64 19 discount 9994 non-null float64 20 profit 9994 non-null float64 dtypes: float64(3), int64(2), object(16) memory usage: 1.6+ MB
# Default load of the monthly car-sales series: Month is an ordinary column.
data13 = pd.read_csv(car_sales)
data13.head()
Month | Sales | |
---|---|---|
0 | 2016-01 | 266 |
1 | 2016-02 | 146 |
2 | 2016-03 | 183 |
3 | 2016-04 | 119 |
4 | 2016-05 | 180 |
# Reload with Month parsed as dates and used as the index, giving a
# date-indexed frame that plots directly as a time series.
data13 = pd.read_csv(car_sales,index_col='Month',parse_dates=['Month'])
data13.head()
Sales | |
---|---|
Month | |
2016-01-01 | 266 |
2016-02-01 | 146 |
2016-03-01 | 183 |
2016-04-01 | 119 |
2016-05-01 | 180 |
# Plot Sales against the date index; the trailing semicolon keeps the
# notebook from echoing the returned object's repr.
data13.plot();
# Load the adult dataset with all of its columns.
data14 = pd.read_csv(adult)
data14.head()
age | workclass | fnlwgt | education | educationNum | marital | occupation | relationship | race | sex | capGain | capLoss | HoursWeek | NativeCountry | category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
# Load only the four listed columns via usecols.
# NOTE(review): this rebinds `data`, which held the hp_costs frame from the
# top of the file — confirm nothing later relies on the old contents.
data = pd.read_csv(adult, usecols=['age','workclass','education','race'])
data.head()
age | workclass | education | race | |
---|---|---|---|---|
0 | 39 | State-gov | Bachelors | White |
1 | 50 | Self-emp-not-inc | Bachelors | White |
2 | 38 | Private | HS-grad | White |
3 | 53 | Private | 11th | Black |
4 | 28 | Private | Bachelors | Black |
# Load the mtcars variant that contains NaN as well as placeholder symbols
# ('$', '*', '?', '-') in place of some values.
data15 = pd.read_csv(mtcars_missing)
data15.head()
car_model | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mazda RX4 | 21.0 | 6 | 160 | 110.0 | 3.90 | 2.62 | 16.46 | 0 | 1 | 4 | 4 |
1 | Mazda RX4 Wag | 21.0 | 6 | $ | 110.0 | NaN | 2.875 | * | 0 | 1 | 4 | 4 |
2 | Datsun 710 | NaN | 4 | 108 | NaN | 3.85 | ? | 18.61 | 1 | 1 | 4 | 1 |
3 | Hornet 4 Drive | 999.0 | 6 | NaN | NaN | 3.08 | 3.215 | - | 1 | 0 | 3 | 1 |
4 | Datsun 710 | 22.8 | 4 | NaN | 93.0 | 3.85 | 2.32 | 18.61 | 1 | 1 | 4 | 1 |
# Convert the placeholder symbols to NaN in place so they are counted as missing.
data15.replace(['$','*','-','?'], np.nan, inplace=True)
data15.head()
car_model | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mazda RX4 | 21.0 | 6 | 160 | 110.0 | 3.90 | 2.62 | 16.46 | 0 | 1 | 4 | 4 |
1 | Mazda RX4 Wag | 21.0 | 6 | NaN | 110.0 | NaN | 2.875 | NaN | 0 | 1 | 4 | 4 |
2 | Datsun 710 | NaN | 4 | 108 | NaN | 3.85 | NaN | 18.61 | 1 | 1 | 4 | 1 |
3 | Hornet 4 Drive | 999.0 | 6 | NaN | NaN | 3.08 | 3.215 | NaN | 1 | 0 | 3 | 1 |
4 | Datsun 710 | 22.8 | 4 | NaN | 93.0 | 3.85 | 2.32 | 18.61 | 1 | 1 | 4 | 1 |
# Total number of missing cells in the whole frame: the first sum() counts
# NaNs per column, the second adds those per-column counts into one scalar.
data15.isnull().sum().sum()
33
# (rows, columns) of the frame, for comparison with the missing-value count.
data15.shape
(32, 12)
# Percentage of rows that contain at least one missing value
# (rows removed by dropna, relative to all rows).
(len(data15) - len(data15.dropna())) * 100 / len(data15)
65.625
# Read a single day's log from a local file (path is relative to the working directory).
data16 = pd.read_csv('log-15feb18.txt')
print(data16.shape)
data16.head()
(20, 4)
IP | Time | URL | Staus | |
---|---|---|---|---|
0 | 10.128.2.1 | 15/Feb/2018:03:10:38 | GET /home.php HTTP/1.1 | 302 |
1 | 10.128.2.1 | 15/Feb/2018:03:10:38 | GET /login.php HTTP/1.1 | 200 |
2 | 10.128.2.1 | 15/Feb/2018:03:10:39 | GET /css/bootstrap.min.css HTTP/1.1 | 200 |
3 | 10.131.0.1 | 15/Feb/2018:03:10:39 | GET /css/font-awesome.min.css HTTP/1.1 | 200 |
4 | 10.130.2.1 | 15/Feb/2018:03:10:39 | GET /css/normalize.css HTTP/1.1 | 200 |
from glob import glob
glob('*')
['00 Top 40 des astuces utiles pandas Partie 1.ipynb', 'adult10.data', 'book_price_data.xlsx', 'book_reviews_price.xlsx', 'breast_cancer.csv', 'car-sales.csv', 'house_price_bangalore.csv', 'hp_other_costs.csv', 'log-15feb18.txt', 'log-16feb18.txt', 'log-17feb18.txt', 'log-18feb18.txt', 'log-19feb18.txt', 'mtcars.csv', 'mtcars_missing.csv', 'Pandas Tutorial - Top 40 Useful Tricks.ipynb', 'sales_returns.csv', 'superstore_orders.csv', 'winequality.csv']
glob('log*')
['log-15feb18.txt', 'log-16feb18.txt', 'log-17feb18.txt', 'log-18feb18.txt', 'log-19feb18.txt']
# Collect the log files of the working directory in sorted (lexicographic) order.
files = sorted(glob('log*'))
files
['log-15feb18.txt', 'log-16feb18.txt', 'log-17feb18.txt', 'log-18feb18.txt', 'log-19feb18.txt']
# Read every log file and stack the results into a single frame whose index is
# renumbered from 0 instead of repeating each file's own row numbers.
data16 = pd.concat(map(pd.read_csv, files), ignore_index=True)
data16.head(25)
IP | Time | URL | Staus | |
---|---|---|---|---|
0 | 10.128.2.1 | 15/Feb/2018:03:10:38 | GET /home.php HTTP/1.1 | 302 |
1 | 10.128.2.1 | 15/Feb/2018:03:10:38 | GET /login.php HTTP/1.1 | 200 |
2 | 10.128.2.1 | 15/Feb/2018:03:10:39 | GET /css/bootstrap.min.css HTTP/1.1 | 200 |
3 | 10.131.0.1 | 15/Feb/2018:03:10:39 | GET /css/font-awesome.min.css HTTP/1.1 | 200 |
4 | 10.130.2.1 | 15/Feb/2018:03:10:39 | GET /css/normalize.css HTTP/1.1 | 200 |
5 | 10.128.2.1 | 15/Feb/2018:03:10:39 | GET /css/main.css HTTP/1.1 | 200 |
6 | 10.131.0.1 | 15/Feb/2018:03:10:39 | GET /css/style.css HTTP/1.1 | 200 |
7 | 10.130.2.1 | 15/Feb/2018:03:10:39 | GET /js/vendor/modernizr-2.8.3.min.js HTTP/1.1 | 200 |
8 | 10.130.2.1 | 15/Feb/2018:03:10:39 | GET /js/vendor/jquery-1.12.0.min.js HTTP/1.1 | 200 |
9 | 10.131.0.1 | 15/Feb/2018:03:10:39 | GET /bootstrap-3.3.7/js/bootstrap.min.js HTTP/1.1 | 200 |
10 | 10.131.0.1 | 15/Feb/2018:03:10:41 | GET /fonts/fontawesome-webfont.woff2?v=4.6.3 H... | 200 |
11 | 10.130.2.1 | 15/Feb/2018:03:10:42 | GET /img/ruet.png HTTP/1.1 | 200 |
12 | 10.130.2.1 | 15/Feb/2018:03:10:42 | GET / HTTP/1.1 | 302 |
13 | 10.130.2.1 | 15/Feb/2018:13:22:04 | GET /login.php HTTP/1.1 | 200 |
14 | 10.128.2.1 | 15/Feb/2018:17:06:12 | GET / HTTP/1.1 | 302 |
15 | 10.128.2.1 | 15/Feb/2018:17:06:12 | GET /login.php HTTP/1.1 | 200 |
16 | 10.128.2.1 | 15/Feb/2018:17:06:12 | GET /img/ruet.png HTTP/1.1 | 200 |
17 | 10.128.2.1 | 15/Feb/2018:22:19:51 | GET /robots.txt HTTP/1.1 | 404 |
18 | 10.128.2.1 | 15/Feb/2018:22:19:55 | GET / HTTP/1.1 | 302 |
19 | 10.130.2.1 | 15/Feb/2018:22:19:59 | GET /login.php HTTP/1.1 | 200 |
20 | 10.130.2.1 | 16/Feb/2018:04:47:15 | GET /robots.txt HTTP/1.1 | 404 |
21 | 10.128.2.1 | 16/Feb/2018:11:37:22 | GET /robots.txt HTTP/1.1 | 404 |
22 | 10.128.2.1 | 16/Feb/2018:11:37:26 | GET / HTTP/1.1 | 302 |
23 | 10.130.2.1 | 16/Feb/2018:11:37:30 | GET /login.php HTTP/1.1 | 200 |
24 | 10.130.2.1 | 16/Feb/2018:11:40:26 | GET /home.php HTTP/1.1 | 302 |
data16.shape
(184, 4)
data17 = pd.read_csv(adult)
data17.head()
age | workclass | fnlwgt | education | educationNum | marital | occupation | relationship | race | sex | capGain | capLoss | HoursWeek | NativeCountry | category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
data17.select_dtypes(include='object')
workclass | education | marital | occupation | relationship | race | sex | NativeCountry | category | |
---|---|---|---|---|---|---|---|---|---|
0 | State-gov | Bachelors | Never-married | Adm-clerical | Not-in-family | White | Male | United-States | <=50K |
1 | Self-emp-not-inc | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | Male | United-States | <=50K |
2 | Private | HS-grad | Divorced | Handlers-cleaners | Not-in-family | White | Male | United-States | <=50K |
3 | Private | 11th | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | United-States | <=50K |
4 | Private | Bachelors | Married-civ-spouse | Prof-specialty | Wife | Black | Female | Cuba | <=50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32556 | Private | Assoc-acdm | Married-civ-spouse | Tech-support | Wife | White | Female | United-States | <=50K |
32557 | Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | United-States | >50K |
32558 | Private | HS-grad | Widowed | Adm-clerical | Unmarried | White | Female | United-States | <=50K |
32559 | Private | HS-grad | Never-married | Adm-clerical | Own-child | White | Male | United-States | <=50K |
32560 | Self-emp-inc | HS-grad | Married-civ-spouse | Exec-managerial | Wife | White | Female | United-States | >50K |
32561 rows × 9 columns
# Place the object-dtype (text) columns first, followed by all remaining columns.
data18 = pd.concat(
    [
        data17.select_dtypes(include='object'),
        data17.select_dtypes(exclude='object'),
    ],
    axis='columns',
)
data18.head()
workclass | education | marital | occupation | relationship | race | sex | NativeCountry | category | age | fnlwgt | educationNum | capGain | capLoss | HoursWeek | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | State-gov | Bachelors | Never-married | Adm-clerical | Not-in-family | White | Male | United-States | <=50K | 39 | 77516 | 13 | 2174 | 0 | 40 |
1 | Self-emp-not-inc | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | Male | United-States | <=50K | 50 | 83311 | 13 | 0 | 0 | 13 |
2 | Private | HS-grad | Divorced | Handlers-cleaners | Not-in-family | White | Male | United-States | <=50K | 38 | 215646 | 9 | 0 | 0 | 40 |
3 | Private | 11th | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | United-States | <=50K | 53 | 234721 | 7 | 0 | 0 | 40 |
4 | Private | Bachelors | Married-civ-spouse | Prof-specialty | Wife | Black | Female | Cuba | <=50K | 28 | 338409 | 13 | 0 | 0 | 40 |
from sklearn.preprocessing import LabelEncoder

# Encode each text column as integer codes. The columns are picked by dtype and
# assigned by name (not through a .loc slice) so that every column is replaced
# and takes the integer dtype of the encoded values instead of staying `object`,
# and so that the selection does not depend on the column order.
categorical_cols = data18.select_dtypes(include='object').columns
data18[categorical_cols] = data18[categorical_cols].apply(LabelEncoder().fit_transform)
data18.head()
workclass | education | marital | occupation | relationship | race | sex | NativeCountry | category | age | fnlwgt | educationNum | capGain | capLoss | HoursWeek | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7 | 9 | 4 | 1 | 1 | 4 | 1 | 39 | 0 | 39 | 77516 | 13 | 2174 | 0 | 40 |
1 | 6 | 9 | 2 | 4 | 0 | 4 | 1 | 39 | 0 | 50 | 83311 | 13 | 0 | 0 | 13 |
2 | 4 | 11 | 0 | 6 | 1 | 4 | 1 | 39 | 0 | 38 | 215646 | 9 | 0 | 0 | 40 |
3 | 4 | 1 | 2 | 6 | 0 | 2 | 1 | 39 | 0 | 53 | 234721 | 7 | 0 | 0 | 40 |
4 | 4 | 9 | 2 | 10 | 5 | 2 | 0 | 5 | 0 | 28 | 338409 | 13 | 0 | 0 | 40 |
data19 = pd.read_csv(breast_cancer)
print(data19.shape)
data19.head()
(569, 31)
mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | radius error | texture error | perimeter error | area error | smoothness error | compactness error | concavity error | concave points error | symmetry error | fractal dimension error | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 |
# Pairwise correlation matrix of data19, then the `outcome` column of that
# matrix expressed in percent.
data_corr = data19.corr()
data_corr['outcome'] * 100
mean radius 73.002851 mean texture 41.518530 mean perimeter 74.263553 mean area 70.898384 mean smoothness 35.855997 mean compactness 59.653368 mean concavity 69.635971 mean concave points 77.661384 mean symmetry 33.049855 mean fractal dimension -1.283760 radius error 56.713382 texture error -0.830333 perimeter error 55.614070 area error 54.823594 smoothness error -6.701601 compactness error 29.299924 concavity error 25.372977 concave points error 40.804233 symmetry error -0.652176 fractal dimension error 7.797242 worst radius 77.645378 worst texture 45.690282 worst perimeter 78.291414 worst area 73.382503 worst smoothness 42.146486 worst compactness 59.099824 worst concavity 65.961021 worst concave points 79.356602 worst symmetry 41.629431 worst fractal dimension 32.387219 outcome 100.000000 Name: outcome, dtype: float64
# Correlations with `outcome`, strongest positive first.
data_corr['outcome'].sort_values(ascending=False)
outcome 1.000000 worst concave points 0.793566 worst perimeter 0.782914 mean concave points 0.776614 worst radius 0.776454 mean perimeter 0.742636 worst area 0.733825 mean radius 0.730029 mean area 0.708984 mean concavity 0.696360 worst concavity 0.659610 mean compactness 0.596534 worst compactness 0.590998 radius error 0.567134 perimeter error 0.556141 area error 0.548236 worst texture 0.456903 worst smoothness 0.421465 worst symmetry 0.416294 mean texture 0.415185 concave points error 0.408042 mean smoothness 0.358560 mean symmetry 0.330499 worst fractal dimension 0.323872 compactness error 0.292999 concavity error 0.253730 fractal dimension error 0.077972 symmetry error -0.006522 texture error -0.008303 mean fractal dimension -0.012838 smoothness error -0.067016 Name: outcome, dtype: float64
data20 = pd.read_csv(breast_cancer)
print(data20.shape)
data20.head()
(569, 31)
mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | radius error | texture error | perimeter error | area error | smoothness error | compactness error | concavity error | concave points error | symmetry error | fractal dimension error | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 |
data20.rename(columns = {'mean radius': 'rayon moyen'},inplace=True)
data20.head()
rayon moyen | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | radius error | texture error | perimeter error | area error | smoothness error | compactness error | concavity error | concave points error | symmetry error | fractal dimension error | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 |
data20.columns.str.replace( ' ','_')
Index(['rayon_moyen', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness', 'mean_compactness', 'mean_concavity', 'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension', 'radius_error', 'texture_error', 'perimeter_error', 'area_error', 'smoothness_error', 'compactness_error', 'concavity_error', 'concave_points_error', 'symmetry_error', 'fractal_dimension_error', 'worst_radius', 'worst_texture', 'worst_perimeter', 'worst_area', 'worst_smoothness', 'worst_compactness', 'worst_concavity', 'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension', 'outcome'], dtype='object')
# Collapse every run of non-word characters in the column names into a single
# underscore. The pattern is a raw string and regex=True is passed explicitly:
# newer pandas treats the pattern as literal text by default, which would leave
# the names unchanged.
data20.columns = data20.columns.str.replace(r'\W+', '_', regex=True)
C:\Users\Pb\anaconda3\envs\Franckeale04\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: The default value of regex will change from True to False in a future version. """Entry point for launching an IPython kernel.
data20.head()
rayon_moyen | mean_texture | mean_perimeter | mean_area | mean_smoothness | mean_compactness | mean_concavity | mean_concave_points | mean_symmetry | mean_fractal_dimension | radius_error | texture_error | perimeter_error | area_error | smoothness_error | compactness_error | concavity_error | concave_points_error | symmetry_error | fractal_dimension_error | worst_radius | worst_texture | worst_perimeter | worst_area | worst_smoothness | worst_compactness | worst_concavity | worst_concave_points | worst_symmetry | worst_fractal_dimension | outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 |
data20.radius_error
0 1.0950 1 0.5435 2 0.7456 3 0.4956 4 0.7572 ... 564 1.1760 565 0.7655 566 0.4564 567 0.7260 568 0.3857 Name: radius_error, Length: 569, dtype: float64
# !{sys.executable} -m pip install pandas-profiling
import sys
import pandas_profiling
# ProfileReport works on a DataFrame; `mtcars` only holds the CSV location,
# so the data is loaded before it is profiled.
pandas_profiling.ProfileReport(pd.read_csv(mtcars))