-
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Labels
Description
Pandas allows for group-wise computation of the correlation coefficient.
```python
In [4]: df = pd.DataFrame(np.random.randn(100, 4))

In [5]: df = pd.DataFrame(np.random.randn(100, 4), columns=list('abcd'))

In [6]: df['key'] = np.random.randint(0, 4, size=100)

In [7]: ddf = dd.from_pandas(df, 2)

In [8]: df.groupby('key').corr()
Out[8]:
              a         b         c         d
key
0   a  1.000000  0.012920  0.134284  0.087320
    b  0.012920  1.000000  0.169226 -0.318727
    c  0.134284  0.169226  1.000000  0.183607
    d  0.087320 -0.318727  0.183607  1.000000
1   a  1.000000 -0.010823 -0.351167 -0.156504
    b -0.010823  1.000000 -0.056893 -0.134058
    c -0.351167 -0.056893  1.000000 -0.188640
    d -0.156504 -0.134058 -0.188640  1.000000
2   a  1.000000  0.062685 -0.211482  0.239097
    b  0.062685  1.000000 -0.085522  0.219293
    c -0.211482 -0.085522  1.000000 -0.347176
    d  0.239097  0.219293 -0.347176  1.000000
3   a  1.000000 -0.078373  0.070477 -0.185380
    b -0.078373  1.000000  0.218543  0.053537
    c  0.070477  0.218543  1.000000  0.070091
    d -0.185380  0.053537  0.070091  1.000000
```

xref #4372
cc @quasiben, @jangorecki