听说python科学计算的numpy和pandas特别厉害，于是想尝试一下，搜索知乎，发现了这篇问答：如何系统地学习Python 中 matplotlib, numpy, scipy, pandas？，顺藤摸瓜，发现了几个学习资料，暂且收集在这里。

ok，资料齐全，接下来开始学习。

NumPy 基础：数组和矢量计算

import numpy as np
import pandas as pd
print 'hello world'

hello world

创建ndarray

data1 = [6,7.5,8,0,1]
arr1 = np.array(data1)
arr1

array([ 6. ,  7.5,  8. ,  0. ,  1. ])

data2 = [[1,2,3,4],[5,6,7,8]]
arr2 = np.array(data2)
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

arr2.ndim

arr2.shape

(2L, 4L)

arr2.dtype

dtype('int32')

np.zeros(10)

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

np.zeros((3,6))

array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])

np.ones((3,4))

array([[ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.]])

np.empty((2,3,4))

array([[[  1.82577335e-316,   8.73107750e-316,   1.81818490e-316,
           3.13307342e-316],
        [  2.08418905e-316,   8.73107987e-316,   1.81818490e-316,
           8.72472026e-316],
        [  2.08418905e-316,   8.73108224e-316,   1.81818490e-316,
           8.72315545e-316]],

       [[  2.08418905e-316,   8.73108462e-316,   1.81818490e-316,
           8.72802101e-316],
        [  2.08418905e-316,   8.73108699e-316,   1.81818490e-316,
           8.72462540e-316],
        [  2.08418905e-316,   8.72808228e-316,   1.81818490e-316,
           8.73047909e-316]]])

np.arange(15)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

np.eye(10)

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

np.identity(10)

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

b = np.array([(1.5,2,3),(4,5,6)])
b

array([[ 1.5,  2. ,  3. ],
       [ 4. ,  5. ,  6. ]])

c = np.array([[1,2],[3,4]],dtype = complex)
c

array([[ 1.+0.j,  2.+0.j],
       [ 3.+0.j,  4.+0.j]])

a = np.arange(10)
print a

[0 1 2 3 4 5 6 7 8 9]

b = np.arange(12).reshape(4,3)
print b

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]

c = np.arange(24).reshape(2,3,4)
print c

[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]]

数组类型

arr = np.array([1,2,3,4,5])
arr.dtype

dtype('int32')

float_arr = arr.astype(np.float32)
arr.dtype

dtype('int32')

arr = np.array([3.7,-1.2,-2.6,0.5,12.9,10.1])
arr.dtype

dtype('float64')

arr.astype(np.int32)

array([ 3, -1, -2,  0, 12, 10])

numeric_strings = np.array(['1.25','-9.6','42'],dtype = np.string_)
numeric_strings.dtype

dtype('S4')

numeric_strings.astype(float)

array([  1.25,  -9.6 ,  42.  ])

arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

索引和切片

arr[5]

arr[5:8]

array([5, 6, 7])

arr[5:8] =12
arr

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

arr_slice = arr[5:8]
arr_slice[1] = 12345
arr_slice

array([   12, 12345,    12])

arr

array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,     9])

arr_slice[:] = 64
arr_slice
arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

a = np.arange(10)
a[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

a[2:9:3]#[start:end:step]

array([2, 5, 8])

基本运算

a = np.array([20,30,40,50])
b = np.arange(4)
b

array([0, 1, 2, 3])

c = a-b
c

array([20, 29, 38, 47])

NumPy中的乘法运算符 * 指示按元素计算，矩阵乘法可以使用 dot 函数或创建矩阵对象实现

A = np.array([[1,1],[0,1]])
B = np.array([[2,0],[3,4]])

A*B

array([[2, 0],
       [0, 4]])

np.dot(A,B)

array([[5, 4],
       [3, 4]])

可视化

import matplotlib.pyplot as plt
%matplotlib inline

x = np.linspace(0,3,20)
y = np.linspace(0,9,20)
plt.plot(x,y)
plt.show()

png

image = np.random.rand(30,30)
plt.imshow(image,cmap=plt.cm.hot)
plt.colorbar()

<matplotlib.colorbar.Colorbar at 0xc31ccf8>

png

Worked Example: data statistics

data = np.loadtxt('C:\Users\zluck\Documents\Python\Python materials\scipy-lecture-notes-master\data\populations.txt')

year,hares,lynxes,carrots = data.T
plt.axes([0.2,0.1,0.5,0.8])
plt.plot(year,hares,year,lynxes,year,carrots)
plt.legend(('Hare','Lynx','Carrot'),loc = (1.05,0.5))

<matplotlib.legend.Legend at 0xd0c2b70>

png

populations = data[:,1:]
populations.mean(axis=0)

array([ 34080.95238095,  20166.66666667,  42400.        ])

#Which species has the highest population each year
np.argmax(populations,axis=1)

array([2, 2, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 2, 2, 2], dtype=int64)

Broadcasting

A lot of grid-based or network-based problems can also use broadcasting. For instance, if we want to compute the distance from the origin of points on a 5x5 grid, we can do

x,y = np.arange(5),np.arange(5)[:,np.newaxis]
distance = np.sqrt(x**2+y**2)
distance

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  4.        ],
       [ 1.        ,  1.41421356,  2.23606798,  3.16227766,  4.12310563],
       [ 2.        ,  2.23606798,  2.82842712,  3.60555128,  4.47213595],
       [ 3.        ,  3.16227766,  3.60555128,  4.24264069,  5.        ],
       [ 4.        ,  4.12310563,  4.47213595,  5.        ,  5.65685425]])

plt.pcolor(distance)
plt.colorbar()

<matplotlib.colorbar.Colorbar at 0xd390cf8>

png

x,y = np.ogrid[0:5,0:5]
x,y

(array([[0],
        [1],
        [2],
        [3],
        [4]]), array([[0, 1, 2, 3, 4]]))

x.shape,y.shape

((5L, 1L), (1L, 5L))

distance = np.sqrt(x**2+y**2)
distance

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  4.        ],
       [ 1.        ,  1.41421356,  2.23606798,  3.16227766,  4.12310563],
       [ 2.        ,  2.23606798,  2.82842712,  3.60555128,  4.47213595],
       [ 3.        ,  3.16227766,  3.60555128,  4.24264069,  5.        ],
       [ 4.        ,  4.12310563,  4.47213595,  5.        ,  5.65685425]])

x,y = np.mgrid[0:4,0:4]
x

array([[0, 0, 0, 0],
       [1, 1, 1, 1],
       [2, 2, 2, 2],
       [3, 3, 3, 3]])

array([[0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3]])

Array shape manipulation

Flattening

a = np.array([[1,2,3],[4,5,6]])
a.ravel()

array([1, 2, 3, 4, 5, 6])

a.T

array([[1, 4],
       [2, 5],
       [3, 6]])

a.T.ravel()

array([1, 4, 2, 5, 3, 6])

Reshaping

a.shape

(2L, 3L)

b = a.ravel()
b = b.reshape((2,3))
b

array([[1, 2, 3],
       [4, 5, 6]])

b[0,0] = 99
a

array([[99,  2,  3],
       [ 4,  5,  6]])

a = np.zeros((3,2))
b = a.T.reshape((3*2))
b

array([ 0.,  0.,  0.,  0.,  0.,  0.])

b[0]=9
a

array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])

Dimension shuffling

a = np.arange(4*3*2).reshape(4,3,2)
a.shape

(4L, 3L, 2L)

a[0,2,1]

b = a.transpose(1,2,0)
b.shape

(3L, 2L, 4L)

b[2,1,0]

b[2,1,0] = -1
a[0,2,1]

-1

Sorting Data

a = np.array([[4,3,5],[1,2,1]])
b = np.sort(a,axis = 1)
b

array([[3, 4, 5],
       [1, 1, 2]])

a.sort(axis = 1)
a

array([[3, 4, 5],
       [1, 1, 2]])

a = np.array([4,3,1,2])
j = np.argsort(a)
j

array([2, 3, 1, 0], dtype=int64)

a[j]

array([1, 2, 3, 4])

a.sort()
a

array([1, 2, 3, 4])

a = np.array([4,3,1,2])
j_max = np.argmax(a)
j_min = np.argmin(a)
j_max, j_min

(0, 2)

samples = np.zeros((6,),dtype = [('sensor_code','S4'),('position',float),('value',float)])
samples

array([('', 0.0, 0.0), ('', 0.0, 0.0), ('', 0.0, 0.0), ('', 0.0, 0.0),
       ('', 0.0, 0.0), ('', 0.0, 0.0)], 
      dtype=[('sensor_code', 'S4'), ('position', '<f8'), ('value', '<f8')])

samples.ndim

samples.shape

(6L,)

samples.dtype.names

('sensor_code', 'position', 'value')

samples[:] = [('ALFA', 1, 0.37), ('BETA', 1, 0.11), ('TAU', 1, 0.13),
('ALFA', 1.5, 0.37), ('ALFA', 3, 0.11), ('TAU', 1.2, 0.13)]
samples

array([('ALFA', 1.0, 0.37), ('BETA', 1.0, 0.11), ('TAU', 1.0, 0.13),
       ('ALFA', 1.5, 0.37), ('ALFA', 3.0, 0.11), ('TAU', 1.2, 0.13)], 
      dtype=[('sensor_code', 'S4'), ('position', '<f8'), ('value', '<f8')])

samples['sensor_code']

array(['ALFA', 'BETA', 'TAU', 'ALFA', 'ALFA', 'TAU'], 
      dtype='|S4')

samples['value']

array([ 0.37,  0.11,  0.13,  0.37,  0.11,  0.13])

samples[0]

('ALFA', 1.0, 0.37)

samples[['position','value']]

array([(1.0, 0.37), (1.0, 0.11), (1.0, 0.13), (1.5, 0.37), (3.0, 0.11),
       (1.2, 0.13)], 
      dtype=[('position', '<f8'), ('value', '<f8')])

samples[samples['sensor_code']=='ALFA']

array([('ALFA', 1.0, 0.37), ('ALFA', 1.5, 0.37), ('ALFA', 3.0, 0.11)], 
      dtype=[('sensor_code', 'S4'), ('position', '<f8'), ('value', '<f8')])

x = np.arange(10)
y = np.array([2,7,13])
np.in1d(x,y)  #similar to %in% of R

array([False, False,  True, False, False, False, False,  True, False, False], dtype=bool)

maskedarray: dealing with (propagation of) missing data

For floats one could use NaN’s, but masks work for all types:

x = np.ma.array([1,2,3,4],mask = [0,1,0,1])
x

masked_array(data = [1 -- 3 --],
             mask = [False  True False  True],
       fill_value = 999999)

y = np.ma.array([1,2,3,4],mask = [0,1,1,1])
x+y

masked_array(data = [2 -- -- --],
             mask = [False  True  True  True],
       fill_value = 999999)

np.ma.sqrt([1,-1,2,-2])

masked_array(data = [1.0 -- 1.4142135623730951 --],
             mask = [False  True False  True],
       fill_value = 1e+20)

Advanced operations

Polynomials

For example, $3x^2 + 2x - 1$:

p = np.poly1d([3,2,-1])
p(0)

-1

p.roots

array([-1.        ,  0.33333333])

p.order

x = np.linspace(0,1,20)
y = np.cos(x) + 0.3*np.random.rand(20)
p = np.poly1d(np.polyfit(x,y,3))
t = np.linspace(0,1,200)
plt.plot(x,y,'o',t,p(t),'-')

[<matplotlib.lines.Line2D at 0xcb74e80>,
 <matplotlib.lines.Line2D at 0xcb74f60>]

png

Loading data files

pwd

u'C:\\Users\\zluck\\Documents\\Python'

cd C:\Users\zluck\Documents\Python\Python materials\scipy-lecture-notes-master\data

C:\Users\zluck\Documents\Python\Python materials\scipy-lecture-notes-master\data

data = np.loadtxt('populations.txt')
data

array([[  1900.,  30000.,   4000.,  48300.],
       [  1901.,  47200.,   6100.,  48200.],
       [  1902.,  70200.,   9800.,  41500.],
       [  1903.,  77400.,  35200.,  38200.],
       [  1904.,  36300.,  59400.,  40600.],
       [  1905.,  20600.,  41700.,  39800.],
       [  1906.,  18100.,  19000.,  38600.],
       [  1907.,  21400.,  13000.,  42300.],
       [  1908.,  22000.,   8300.,  44500.],
       [  1909.,  25400.,   9100.,  42100.],
       [  1910.,  27100.,   7400.,  46000.],
       [  1911.,  40300.,   8000.,  46800.],
       [  1912.,  57000.,  12300.,  43800.],
       [  1913.,  76600.,  19500.,  40900.],
       [  1914.,  52300.,  45700.,  39400.],
       [  1915.,  19500.,  51100.,  39000.],
       [  1916.,  11200.,  29700.,  36700.],
       [  1917.,   7600.,  15800.,  41800.],
       [  1918.,  14600.,   9700.,  43300.],
       [  1919.,  16200.,  10100.,  41300.],
       [  1920.,  24700.,   8600.,  47300.]])

np.savetxt('pop2.txt',data)
data2 = np.loadtxt('pop2.txt')

img = plt.imread('elephant.png')
img.shape,img.dtype

((200L, 300L, 3L), dtype('float32'))

plt.imshow(img)

<matplotlib.image.AxesImage at 0xc422cc0>

png

plt.savefig('plot.png')

<matplotlib.figure.Figure at 0xc3287b8>

plt.imsave('red_elephant.png',img[:,:,0],cmap = plt.cm.gray)

plt.imshow(plt.imread('red_elephant.png'))

<matplotlib.image.AxesImage at 0xcea10f0>

png