pandas-tutorial
November 29, 2024
[7]: import pandas as pd
import numpy as np
df = pd.read_csv("C:/Local Code/Resources/Data/chipotle.tsv", sep = "\t")  # .tsv is tab-separated, so pass the delimiter explicitly
[8]: df.head()
[8]: order_id quantity item_name \
0 1 1 Chips and Fresh Tomato Salsa
1 1 1 Izze
2 1 1 Nantucket Nectar
3 1 1 Chips and Tomatillo-Green Chili Salsa
4 2 2 Chicken Bowl
choice_description item_price
0 NaN $2.39
1 [Clementine] $3.39
2 [Apple] $3.39
3 NaN $2.39
4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans… $16.98
[9]: df.info() # show information about each column
# choice_description contains many NaN values, so only 3376 of its entries
# are usable
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 4622 non-null int64
1 quantity 4622 non-null int64
2 item_name 4622 non-null object
3 choice_description 3376 non-null object
4 item_price 4622 non-null object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB
1
[10]: df.columns # returns an Index object rather than a plain list
# to convert it to a list, simply call:
list(df.columns)
[10]: ['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']
[11]: df.index # get the row index, i.e. the range of labels the data runs from and to
[11]: RangeIndex(start=0, stop=4622, step=1)
[12]: # describe: return summary statistics (only numeric columns by default)
df.describe()
[12]: order_id quantity
count 4622.000000 4622.000000
mean 927.254868 1.075725
std 528.890796 0.410186
min 1.000000 1.000000
25% 477.250000 1.000000
50% 926.000000 1.000000
75% 1393.000000 1.000000
max 1834.000000 15.000000
[13]: df.describe(include = "all")  # include="all" also summarises the non-numeric columns (unique, top, freq)
[13]: order_id quantity item_name choice_description item_price
count 4622.000000 4622.000000 4622 3376 4622
unique NaN NaN 50 1043 78
top NaN NaN Chicken Bowl [Diet Coke] $8.75
freq NaN NaN 726 134 730
mean 927.254868 1.075725 NaN NaN NaN
std 528.890796 0.410186 NaN NaN NaN
min 1.000000 1.000000 NaN NaN NaN
25% 477.250000 1.000000 NaN NaN NaN
50% 926.000000 1.000000 NaN NaN NaN
75% 1393.000000 1.000000 NaN NaN NaN
max 1834.000000 15.000000 NaN NaN NaN
[14]: df.describe(percentiles = [0.1,0.3,0.24,0.44] )  # percentiles are fractions in [0, 1]; the output lists them sorted and still includes 50%
[14]: order_id quantity
count 4622.000000 4622.000000
mean 927.254868 1.075725
std 528.890796 0.410186
min 1.000000 1.000000
10% 198.000000 1.000000
2
24% 458.040000 1.000000
30% 563.000000 1.000000
44% 818.000000 1.000000
50% 926.000000 1.000000
max 1834.000000 15.000000
Integer Location (iloc) and Index Location (loc)
[16]: df.head()
[16]: order_id quantity item_name \
0 1 1 Chips and Fresh Tomato Salsa
1 1 1 Izze
2 1 1 Nantucket Nectar
3 1 1 Chips and Tomatillo-Green Chili Salsa
4 2 2 Chicken Bowl
choice_description item_price
0 NaN $2.39
1 [Clementine] $3.39
2 [Apple] $3.39
3 NaN $2.39
4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans… $16.98
[17]: df.loc[(df.quantity == 15) | (df.item_name == "Nantucket Nectar")]  # rows matching either condition; each comparison is parenthesised because | binds tighter than ==
[17]: order_id quantity item_name \
2 1 1 Nantucket Nectar
22 11 1 Nantucket Nectar
105 46 1 Nantucket Nectar
173 77 1 Nantucket Nectar
205 91 1 Nantucket Nectar
436 189 1 Nantucket Nectar
601 247 2 Nantucket Nectar
925 381 1 Nantucket Nectar
1356 553 1 Nantucket Nectar
1585 641 1 Nantucket Nectar
1626 656 1 Nantucket Nectar
1706 690 1 Nantucket Nectar
2162 872 1 Nantucket Nectar
2379 947 2 Nantucket Nectar
2381 947 1 Nantucket Nectar
2430 965 1 Nantucket Nectar
2653 1053 1 Nantucket Nectar
2818 1118 1 Nantucket Nectar
2838 1128 1 Nantucket Nectar
2853 1133 1 Nantucket Nectar
3
2949 1172 1 Nantucket Nectar
3318 1330 1 Nantucket Nectar
3368 1351 1 Nantucket Nectar
3570 1433 1 Nantucket Nectar
3598 1443 15 Chips and Fresh Tomato Salsa
3845 1541 1 Nantucket Nectar
4019 1609 1 Nantucket Nectar
4078 1632 1 Nantucket Nectar
choice_description item_price
2 [Apple] $3.39
22 [Pomegranate Cherry] $3.39
105 [Pineapple Orange Banana] $3.39
173 [Apple] $3.39
205 [Peach Orange] $3.39
436 [Pomegranate Cherry] $3.39
601 [Pineapple Orange Banana] $6.78
925 [Pomegranate Cherry] $3.39
1356 [Pomegranate Cherry] $3.39
1585 [Peach Orange] $3.39
1626 [Pineapple Orange Banana] $3.39
1706 [Apple] $3.39
2162 [Pineapple Orange Banana] $3.39
2379 [Peach Orange] $6.78
2381 [Apple] $3.39
2430 [Pomegranate Cherry] $3.39
2653 [Pineapple Orange Banana] $3.39
2818 [Apple] $3.39
2838 [Peach Orange] $3.39
2853 [Apple] $3.39
2949 [Peach Orange] $3.39
3318 [Peach Orange] $3.39
3368 [Pineapple Orange Banana] $3.39
3570 [Pineapple Orange Banana] $3.39
3598 NaN $44.25
3845 [Peach Orange] $3.39
4019 [Pineapple Orange Banana] $3.39
4078 [Peach Orange] $3.39
[18]: # loc: select the desired rows by label (iloc does the same by integer position);
# a single selected row is returned as a Series
print(type(df.loc[1]))
print(type(df.iloc[1]))
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
4
[19]: df.loc[(df.quantity ==2) & (df.item_name == "Nantucket Nectar"),␣
↪["order_id","quantity","item_name"]]
[19]: order_id quantity item_name
601 247 2 Nantucket Nectar
2379 947 2 Nantucket Nectar
[20]: df.loc[(df.quantity ==2) & (df.item_name == "Nantucket Nectar")]
[20]: order_id quantity item_name choice_description \
601 247 2 Nantucket Nectar [Pineapple Orange Banana]
2379 947 2 Nantucket Nectar [Peach Orange]
item_price
601 $6.78
2379 $6.78
[21]: df.loc[(df.quantity >= 2) & (df.item_name == "Nantucket Nectar")]  # same two rows as the == 2 filter: no Nantucket Nectar row in this data has quantity above 2
[21]: order_id quantity item_name choice_description \
601 247 2 Nantucket Nectar [Pineapple Orange Banana]
2379 947 2 Nantucket Nectar [Peach Orange]
item_price
601 $6.78
2379 $6.78
[22]: # iloc: select the desired rows and columns by integer position
[23]: df.iloc[3:5, :-1]# rows at positions 3-4, every column except the last; iloc slicing can split off or leave out columns
[23]: order_id quantity item_name \
3 1 1 Chips and Tomatillo-Green Chili Salsa
4 2 2 Chicken Bowl
choice_description
3 NaN
4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans…
[24]: df.iloc[3:5]
[24]: order_id quantity item_name \
3 1 1 Chips and Tomatillo-Green Chili Salsa
4 2 2 Chicken Bowl
choice_description item_price
3 NaN $2.39
5
4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans… $16.98
[25]: df.iloc[3:5, -1] # for the selected rows, show only the column at position -1 (the last column)
[25]: 3 $2.39
4 $16.98
Name: item_price, dtype: object
0.0.1 Data Manipulation
[27]: df.item_price.dtype
[27]: dtype('O')
[28]: # handling type conversion in a DataFrame
# using the apply function
[31]: df["item_price"]=df["item_price"].apply(lambda x: x.replace("$"," "))
[37]: df["item_price"].dtype
[37]: dtype('O')
[33]: df.head()
[33]: order_id quantity item_name \
0 1 1 Chips and Fresh Tomato Salsa
1 1 1 Izze
2 1 1 Nantucket Nectar
3 1 1 Chips and Tomatillo-Green Chili Salsa
4 2 2 Chicken Bowl
choice_description item_price
0 NaN 2.39
1 [Clementine] 3.39
2 [Apple] 3.39
3 NaN 2.39
4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans… 16.98
[30]: print(df.dtype)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_25428\910862699.py in ?()
----> 1 print(df.dtype)
6
C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py in ?(self,␣
↪name)
6295 and name not in self._accessors
6296 and self._info_axis.
↪_can_hold_identifiers_and_holds_name(name)
6297 ):
6298 return self[name]
-> 6299 return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'dtype'
[ ]: df["item_price"] = df["item_price"].astype(float)  # convert the cleaned price strings to float64 so the column can be used numerically
[ ]: correlative = df[["order_id", "item_price"]].corr()  # pairwise correlation matrix of the two columns (Pearson by default)
print(correlative)