import pandas as pd
import numpy as np
# Load the tab-separated Chipotle orders file and preview the first rows.
# The separator must be the single escape sequence "\t"; splitting it across
# lines turns it into the letter "t" and the file is parsed incorrectly.
df = pd.read_csv("C:/Local Code/Resources/Data/chipotle.tsv", sep="\t")
df.head()
order_id quantity item_name \
0 1 1 Chips and Fresh Tomato Salsa
1 1 1 Izze
2 1 1 Nantucket Nectar
3 1 1 Chips and Tomatillo-Green Chili Salsa
4 2 2 Chicken Bowl
choice_description item_price
0 NaN $2.39
1 [Clementine] $3.39
2 [Apple] $3.39
3 NaN $2.39
4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98
df.info()  # show information for each column (non-null count and dtype)
# choice_description has many NaN values, so only 3376 of its entries are usable
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 4622 non-null int64
1 quantity 4622 non-null int64
2 item_name 4622 non-null object
3 choice_description 3376 non-null object
4 item_price 4622 non-null object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB
df.columns # the column labels, held in a pandas object rather than a plain list
# to convert them to a plain list, simply wrap the result in list()
list(df.columns)
['order_id', 'quantity', 'item_name', 'choice_description',
'item_price']
df.index  # the row index: shows where the row labels start and stop
RangeIndex(start=0, stop=4622, step=1)
# describe(): returns a statistical summary (count, mean, std, min,
# quartiles, max); with no arguments only the numeric columns are reported.
df.describe()
order_id quantity
count 4622.000000 4622.000000
mean 927.254868 1.075725
std 528.890796 0.410186
min 1.000000 1.000000
25% 477.250000 1.000000
50% 926.000000 1.000000
75% 1393.000000 1.000000
max 1834.000000 15.000000
# include="all" also summarizes the non-numeric columns, adding the
# unique/top/freq rows; statistics that do not apply to a column show NaN.
df.describe(include = "all")
order_id quantity item_name choice_description
item_price
count 4622.000000 4622.000000 4622 3376
4622
unique NaN NaN 50 1043
78
top NaN NaN Chicken Bowl [Diet Coke]
$8.75
freq NaN NaN 726 134
730
mean 927.254868 1.075725 NaN NaN
NaN
std 528.890796 0.410186 NaN NaN
NaN
min 1.000000 1.000000 NaN NaN
NaN
25% 477.250000 1.000000 NaN NaN
NaN
50% 926.000000 1.000000 NaN NaN
NaN
75% 1393.000000 1.000000 NaN NaN
NaN
max 1834.000000 15.000000 NaN NaN
NaN
# Request custom percentiles instead of the default quartiles; the result
# lists them in ascending order and still includes the 50% (median) row.
df.describe(percentiles = [0.1,0.3,0.24,0.44] )
order_id quantity
count 4622.000000 4622.000000
mean 927.254868 1.075725
std 528.890796 0.410186
min 1.000000 1.000000
10% 198.000000 1.000000
24% 458.040000 1.000000
30% 563.000000 1.000000
44% 818.000000 1.000000
50% 926.000000 1.000000
max 1834.000000 15.000000
# Preview the first five rows again before filtering.
df.head()
order_id quantity item_name \
0 1 1 Chips and Fresh Tomato Salsa
1 1 1 Izze
2 1 1 Nantucket Nectar
3 1 1 Chips and Tomatillo-Green Chili Salsa
4 2 2 Chicken Bowl
choice_description item_price
0 NaN $2.39
1 [Clementine] $3.39
2 [Apple] $3.39
3 NaN $2.39
4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98
# Boolean-mask selection with OR: keep rows where 15 units were ordered
# or where the item is Nantucket Nectar.
df.loc[
    (df["quantity"] == 15) | (df["item_name"] == "Nantucket Nectar")
]
order_id quantity item_name \
2 1 1 Nantucket Nectar
22 11 1 Nantucket Nectar
105 46 1 Nantucket Nectar
173 77 1 Nantucket Nectar
205 91 1 Nantucket Nectar
436 189 1 Nantucket Nectar
601 247 2 Nantucket Nectar
925 381 1 Nantucket Nectar
1356 553 1 Nantucket Nectar
1585 641 1 Nantucket Nectar
1626 656 1 Nantucket Nectar
1706 690 1 Nantucket Nectar
2162 872 1 Nantucket Nectar
2379 947 2 Nantucket Nectar
2381 947 1 Nantucket Nectar
2430 965 1 Nantucket Nectar
2653 1053 1 Nantucket Nectar
2818 1118 1 Nantucket Nectar
2838 1128 1 Nantucket Nectar
2853 1133 1 Nantucket Nectar
2949 1172 1 Nantucket Nectar
3318 1330 1 Nantucket Nectar
3368 1351 1 Nantucket Nectar
3570 1433 1 Nantucket Nectar
3598 1443 15 Chips and Fresh Tomato Salsa
3845 1541 1 Nantucket Nectar
4019 1609 1 Nantucket Nectar
4078 1632 1 Nantucket Nectar
choice_description item_price
2 [Apple] $3.39
22 [Pomegranate Cherry] $3.39
105 [Pineapple Orange Banana] $3.39
173 [Apple] $3.39
205 [Peach Orange] $3.39
436 [Pomegranate Cherry] $3.39
601 [Pineapple Orange Banana] $6.78
925 [Pomegranate Cherry] $3.39
1356 [Pomegranate Cherry] $3.39
1585 [Peach Orange] $3.39
1626 [Pineapple Orange Banana] $3.39
1706 [Apple] $3.39
2162 [Pineapple Orange Banana] $3.39
2379 [Peach Orange] $6.78
2381 [Apple] $3.39
2430 [Pomegranate Cherry] $3.39
2653 [Pineapple Orange Banana] $3.39
2818 [Apple] $3.39
2838 [Peach Orange] $3.39
2853 [Apple] $3.39
2949 [Peach Orange] $3.39
3318 [Peach Orange] $3.39
3368 [Pineapple Orange Banana] $3.39
3570 [Pineapple Orange Banana] $3.39
3598 NaN $44.25
3845 [Peach Orange] $3.39
4019 [Pineapple Orange Banana] $3.39
4078 [Peach Orange] $3.39
# loc: select only the desired rows.
# A single row picked by label (loc) or by position (iloc) is a Series.
print(type(df.loc[1]), type(df.iloc[1]), sep="\n")
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
# Boolean-mask selection with AND, keeping only three columns of the
# matching rows (second argument of loc is the column list).
df.loc[
    (df["quantity"] == 2) & (df["item_name"] == "Nantucket Nectar"),
    ["order_id", "quantity", "item_name"],
]
order_id quantity item_name
601 247 2 Nantucket Nectar
2379 947 2 Nantucket Nectar
# Same AND filter, this time returning every column of the matching rows.
df.loc[
    (df["quantity"] == 2) & (df["item_name"] == "Nantucket Nectar")
]
order_id quantity item_name choice_description
\
601 247 2 Nantucket Nectar [Pineapple Orange Banana]
2379 947 2 Nantucket Nectar [Peach Orange]
item_price
601 $6.78
2379 $6.78
# Nantucket Nectar rows where at least two units were ordered.
df.loc[
    (df["quantity"] >= 2) & (df["item_name"] == "Nantucket Nectar")
]
order_id quantity item_name choice_description
\
601 247 2 Nantucket Nectar [Pineapple Orange Banana]
2379 947 2 Nantucket Nectar [Peach Orange]
item_price
601 $6.78
2379 $6.78
#iloc: select the desired rows and columns by integer position
df.iloc[3:5, :-1]# iloc can be used to split off or drop columns (here: every column except the last)
order_id quantity item_name \
3 1 1 Chips and Tomatillo-Green Chili Salsa
4 2 2 Chicken Bowl
choice_description
3 NaN
4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans...
# Rows at positions 3 and 4 (the slice end is exclusive), all columns.
df.iloc[3:5]
order_id quantity item_name \
3 1 1 Chips and Tomatillo-Green Chili Salsa
4 2 2 Chicken Bowl
choice_description item_price
3 NaN $2.39
4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98
df.iloc[3:5, -1] # show only the column at position -1 (the last one) for the selected rows
3 $2.39
4 $16.98
Name: item_price, dtype: object
Data Manipulation
# Inspect the dtype of the price column before converting it.
df["item_price"].dtype
dtype('O')
# Preparing a column for type conversion inside a DataFrame,
# using apply(): run a function on every value of the column.
# Remove the "$" entirely (replace with an empty string, not a space) so no
# stray leading whitespace is left in the price strings.
df["item_price"] = df["item_price"].apply(lambda x: x.replace("$", ""))
# Still object dtype at this point: the values are strings until cast.
df["item_price"].dtype
dtype('O')
# Check the result: item_price no longer shows the "$" prefix.
df.head()
order_id quantity item_name \
0 1 1 Chips and Fresh Tomato Salsa
1 1 1 Izze
2 1 1 Nantucket Nectar
3 1 1 Chips and Tomatillo-Green Chili Salsa
4 2 2 Chicken Bowl
choice_description item_price
0 NaN 2.39
1 [Clementine] 3.39
2 [Apple] 3.39
3 NaN 2.39
4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... 16.98
# A DataFrame exposes `dtypes` (plural, one entry per column); the singular
# `dtype` exists only on a Series, so `df.dtype` raises AttributeError.
print(df.dtypes)
----------------------------------------------------------------------
-----
AttributeError Traceback (most recent call
last)
~\AppData\Local\Temp\ipykernel_25428\910862699.py in ?()
----> 1 print(df.dtype)
C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py in ?
(self, name)
6295 and name not in self._accessors
6296 and
self._info_axis._can_hold_identifiers_and_holds_name(name)
6297 ):
6298 return self[name]
-> 6299 return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'dtype'
# Cast the price strings to floats, then print the pairwise correlation
# between order_id and item_price.
df["item_price"] = df.item_price.astype(float)
correlative = df.loc[:, ["order_id", "item_price"]].corr()
print(correlative)