如何系统地学习Python 中 matplotlib, numpy, scipy, pandas?

Posted by zluckyH on July 16, 2016

听说python科学计算的numpy、pandas特别厉害,于是想尝试一下,搜索知乎,发现了这篇问答:如何系统地学习Python 中 matplotlib, numpy, scipy, pandas?,顺藤摸瓜,发现了几个学习资料,暂且收集在这里。

ok,资料齐全,接下来开始学习。

NumPy 基础:数组和矢量计算

import numpy as np
import pandas as pd
print 'hello world'
hello world

创建ndarray

data1 = [6,7.5,8,0,1]
arr1 = np.array(data1)
arr1
array([ 6. ,  7.5,  8. ,  0. ,  1. ])
data2 = [[1,2,3,4],[5,6,7,8]]
arr2 = np.array(data2)
arr2
array([[1, 2, 3, 4],
       [5, 6, 7, 8]])
arr2.ndim
2
arr2.shape
(2L, 4L)
arr2.dtype
dtype('int32')
np.zeros(10)
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])
np.zeros((3,6))
array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])
np.ones((3,4))
array([[ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.]])
np.empty((2,3,4))
array([[[  1.82577335e-316,   8.73107750e-316,   1.81818490e-316,
           3.13307342e-316],
        [  2.08418905e-316,   8.73107987e-316,   1.81818490e-316,
           8.72472026e-316],
        [  2.08418905e-316,   8.73108224e-316,   1.81818490e-316,
           8.72315545e-316]],

       [[  2.08418905e-316,   8.73108462e-316,   1.81818490e-316,
           8.72802101e-316],
        [  2.08418905e-316,   8.73108699e-316,   1.81818490e-316,
           8.72462540e-316],
        [  2.08418905e-316,   8.72808228e-316,   1.81818490e-316,
           8.73047909e-316]]])
np.arange(15)
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])
np.eye(10)
array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])
np.identity(10)
array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])
b = np.array([(1.5,2,3),(4,5,6)])
b
array([[ 1.5,  2. ,  3. ],
       [ 4. ,  5. ,  6. ]])
c = np.array([[1,2],[3,4]],dtype = complex)
c
array([[ 1.+0.j,  2.+0.j],
       [ 3.+0.j,  4.+0.j]])
a = np.arange(10)
print a
[0 1 2 3 4 5 6 7 8 9]
b = np.arange(12).reshape(4,3)
print b
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
c = np.arange(24).reshape(2,3,4)
print c
[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]]

数组类型

arr = np.array([1,2,3,4,5])
arr.dtype
dtype('int32')
float_arr = arr.astype(np.float32)
arr.dtype
dtype('int32')
arr = np.array([3.7,-1.2,-2.6,0.5,12.9,10.1])
arr.dtype

dtype('float64')
arr.astype(np.int32)
array([ 3, -1, -2,  0, 12, 10])
numeric_strings = np.array(['1.25','-9.6','42'],dtype = np.string_)
numeric_strings.dtype

dtype('S4')
numeric_strings.astype(float)
array([  1.25,  -9.6 ,  42.  ])
arr = np.arange(10)
arr
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

索引和切片

arr[5]
5
arr[5:8]
array([5, 6, 7])
arr[5:8] =12
arr
array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])
arr_slice = arr[5:8]
arr_slice[1] = 12345
arr_slice
array([   12, 12345,    12])
arr
array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,     9])
arr_slice[:] = 64
arr_slice
arr
array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])
a = np.arange(10)
a[::-1]
array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])
a[2:9:3]#[start:end:step]
array([2, 5, 8])

基本运算

a = np.array([20,30,40,50])
b = np.arange(4)
b
array([0, 1, 2, 3])

c = a-b
c
array([20, 29, 38, 47])

NumPy中的乘法运算符 * 指示按元素计算,矩阵乘法可以使用 dot 函数或创建矩阵对象实现

A = np.array([[1,1],[0,1]])
B = np.array([[2,0],[3,4]])
A*B
array([[2, 0],
       [0, 4]])
np.dot(A,B)
array([[5, 4],
       [3, 4]])

可视化

import matplotlib.pyplot as plt
%matplotlib inline
x = np.linspace(0,3,20)
y = np.linspace(0,9,20)
plt.plot(x,y)
plt.show()

png

image = np.random.rand(30,30)
plt.imshow(image,cmap=plt.cm.hot)
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0xc31ccf8>

png

Worked Example: data statistics

data = np.loadtxt('C:\Users\zluck\Documents\Python\Python materials\scipy-lecture-notes-master\data\populations.txt')
year,hares,lynxes,carrots = data.T
plt.axes([0.2,0.1,0.5,0.8])
plt.plot(year,hares,year,lynxes,year,carrots)
plt.legend(('Hare','Lynx','Carrot'),loc = (1.05,0.5))
<matplotlib.legend.Legend at 0xd0c2b70>

png

populations = data[:,1:]
populations.mean(axis=0)
array([ 34080.95238095,  20166.66666667,  42400.        ])
#Which species has the highest population each year
np.argmax(populations,axis=1)
array([2, 2, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 2, 2, 2], dtype=int64)

Broadcasting

A lot of grid-based or network-based problems can also use broadcasting. For instance, if we want to compute the distance from the origin of points on a 5x5 grid, we can do

x,y = np.arange(5),np.arange(5)[:,np.newaxis]
distance = np.sqrt(x**2+y**2)
distance
array([[ 0.        ,  1.        ,  2.        ,  3.        ,  4.        ],
       [ 1.        ,  1.41421356,  2.23606798,  3.16227766,  4.12310563],
       [ 2.        ,  2.23606798,  2.82842712,  3.60555128,  4.47213595],
       [ 3.        ,  3.16227766,  3.60555128,  4.24264069,  5.        ],
       [ 4.        ,  4.12310563,  4.47213595,  5.        ,  5.65685425]])
plt.pcolor(distance)
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0xd390cf8>

png

x,y = np.ogrid[0:5,0:5]
x,y
(array([[0],
        [1],
        [2],
        [3],
        [4]]), array([[0, 1, 2, 3, 4]]))
x.shape,y.shape
((5L, 1L), (1L, 5L))
distance = np.sqrt(x**2+y**2)
distance
array([[ 0.        ,  1.        ,  2.        ,  3.        ,  4.        ],
       [ 1.        ,  1.41421356,  2.23606798,  3.16227766,  4.12310563],
       [ 2.        ,  2.23606798,  2.82842712,  3.60555128,  4.47213595],
       [ 3.        ,  3.16227766,  3.60555128,  4.24264069,  5.        ],
       [ 4.        ,  4.12310563,  4.47213595,  5.        ,  5.65685425]])
x,y = np.mgrid[0:4,0:4]
x
array([[0, 0, 0, 0],
       [1, 1, 1, 1],
       [2, 2, 2, 2],
       [3, 3, 3, 3]])
y
array([[0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3]])

Array shape manipulation

Flattening

a = np.array([[1,2,3],[4,5,6]])
a.ravel()
array([1, 2, 3, 4, 5, 6])
a.T
array([[1, 4],
       [2, 5],
       [3, 6]])
a.T.ravel()
array([1, 4, 2, 5, 3, 6])

Reshaping

a.shape
(2L, 3L)
b = a.ravel()
b = b.reshape((2,3))
b
array([[1, 2, 3],
       [4, 5, 6]])
b[0,0] = 99
a
array([[99,  2,  3],
       [ 4,  5,  6]])
a = np.zeros((3,2))
b = a.T.reshape((3*2))
b
array([ 0.,  0.,  0.,  0.,  0.,  0.])
b[0]=9
a
array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])

Dimension shuffling

a = np.arange(4*3*2).reshape(4,3,2)
a.shape
(4L, 3L, 2L)
a[0,2,1]
5
b = a.transpose(1,2,0)
b.shape
(3L, 2L, 4L)
b[2,1,0]
5
b[2,1,0] = -1
a[0,2,1]
-1

Sorting Data

a = np.array([[4,3,5],[1,2,1]])
b = np.sort(a,axis = 1)
b
array([[3, 4, 5],
       [1, 1, 2]])
a.sort(axis = 1)
a
array([[3, 4, 5],
       [1, 1, 2]])
a = np.array([4,3,1,2])
j = np.argsort(a)
j
array([2, 3, 1, 0], dtype=int64)
a[j]
array([1, 2, 3, 4])
a.sort()
a
array([1, 2, 3, 4])
a = np.array([4,3,1,2])
j_max = np.argmax(a)
j_min = np.argmin(a)
j_max, j_min
(0, 2)
samples = np.zeros((6,),dtype = [('sensor_code','S4'),('position',float),('value',float)])
samples
array([('', 0.0, 0.0), ('', 0.0, 0.0), ('', 0.0, 0.0), ('', 0.0, 0.0),
       ('', 0.0, 0.0), ('', 0.0, 0.0)], 
      dtype=[('sensor_code', 'S4'), ('position', '<f8'), ('value', '<f8')])
samples.ndim
1
samples.shape
(6L,)
samples.dtype.names
('sensor_code', 'position', 'value')
samples[:] = [('ALFA', 1, 0.37), ('BETA', 1, 0.11), ('TAU', 1, 0.13),
('ALFA', 1.5, 0.37), ('ALFA', 3, 0.11), ('TAU', 1.2, 0.13)]
samples
array([('ALFA', 1.0, 0.37), ('BETA', 1.0, 0.11), ('TAU', 1.0, 0.13),
       ('ALFA', 1.5, 0.37), ('ALFA', 3.0, 0.11), ('TAU', 1.2, 0.13)], 
      dtype=[('sensor_code', 'S4'), ('position', '<f8'), ('value', '<f8')])
samples['sensor_code']
array(['ALFA', 'BETA', 'TAU', 'ALFA', 'ALFA', 'TAU'], 
      dtype='|S4')
samples['value']
array([ 0.37,  0.11,  0.13,  0.37,  0.11,  0.13])
samples[0]
('ALFA', 1.0, 0.37)
samples[['position','value']]
array([(1.0, 0.37), (1.0, 0.11), (1.0, 0.13), (1.5, 0.37), (3.0, 0.11),
       (1.2, 0.13)], 
      dtype=[('position', '<f8'), ('value', '<f8')])
samples[samples['sensor_code']=='ALFA']
array([('ALFA', 1.0, 0.37), ('ALFA', 1.5, 0.37), ('ALFA', 3.0, 0.11)], 
      dtype=[('sensor_code', 'S4'), ('position', '<f8'), ('value', '<f8')])
x = np.arange(10)
y = np.array([2,7,13])
np.in1d(x,y)  #similar to %in% of R
array([False, False,  True, False, False, False, False,  True, False, False], dtype=bool)

maskedarray: dealing with (propagation of) missing data

For floats one could use NaN’s, but masks work for all types:

x = np.ma.array([1,2,3,4],mask = [0,1,0,1])
x
masked_array(data = [1 -- 3 --],
             mask = [False  True False  True],
       fill_value = 999999)
y = np.ma.array([1,2,3,4],mask = [0,1,1,1])
x+y
masked_array(data = [2 -- -- --],
             mask = [False  True  True  True],
       fill_value = 999999)
np.ma.sqrt([1,-1,2,-2])
masked_array(data = [1.0 -- 1.4142135623730951 --],
             mask = [False  True False  True],
       fill_value = 1e+20)

Advanced operations

Polynomials

for example, $3x^2 + 2x - 1$:

p = np.poly1d([3,2,-1])
p(0)
-1
p.roots
array([-1.        ,  0.33333333])
p.order
2
x = np.linspace(0,1,20)
y = np.cos(x) + 0.3*np.random.rand(20)
p = np.poly1d(np.polyfit(x,y,3))
t = np.linspace(0,1,200)
plt.plot(x,y,'o',t,p(t),'-')
[<matplotlib.lines.Line2D at 0xcb74e80>,
 <matplotlib.lines.Line2D at 0xcb74f60>]

png

Loading data files

pwd
u'C:\\Users\\zluck\\Documents\\Python'
cd C:\Users\zluck\Documents\Python\Python materials\scipy-lecture-notes-master\data
C:\Users\zluck\Documents\Python\Python materials\scipy-lecture-notes-master\data
data = np.loadtxt('populations.txt')
data
array([[  1900.,  30000.,   4000.,  48300.],
       [  1901.,  47200.,   6100.,  48200.],
       [  1902.,  70200.,   9800.,  41500.],
       [  1903.,  77400.,  35200.,  38200.],
       [  1904.,  36300.,  59400.,  40600.],
       [  1905.,  20600.,  41700.,  39800.],
       [  1906.,  18100.,  19000.,  38600.],
       [  1907.,  21400.,  13000.,  42300.],
       [  1908.,  22000.,   8300.,  44500.],
       [  1909.,  25400.,   9100.,  42100.],
       [  1910.,  27100.,   7400.,  46000.],
       [  1911.,  40300.,   8000.,  46800.],
       [  1912.,  57000.,  12300.,  43800.],
       [  1913.,  76600.,  19500.,  40900.],
       [  1914.,  52300.,  45700.,  39400.],
       [  1915.,  19500.,  51100.,  39000.],
       [  1916.,  11200.,  29700.,  36700.],
       [  1917.,   7600.,  15800.,  41800.],
       [  1918.,  14600.,   9700.,  43300.],
       [  1919.,  16200.,  10100.,  41300.],
       [  1920.,  24700.,   8600.,  47300.]])
np.savetxt('pop2.txt',data)
data2 = np.loadtxt('pop2.txt')
img = plt.imread('elephant.png')
img.shape,img.dtype
((200L, 300L, 3L), dtype('float32'))
plt.imshow(img)
<matplotlib.image.AxesImage at 0xc422cc0>

png

plt.savefig('plot.png')
<matplotlib.figure.Figure at 0xc3287b8>
plt.imsave('red_elephant.png',img[:,:,0],cmap = plt.cm.gray)
plt.imshow(plt.imread('red_elephant.png'))
<matplotlib.image.AxesImage at 0xcea10f0>

png