Python数据科学工具概览

Python是数据科学领域的主流语言，拥有丰富的数据处理和分析工具。

NumPy数值计算

NumPy核心特性

Python

import numpy as np

# 创建数组
arr = np.array([1, 2, 3, 4, 5])
arr2d = np.array([[1, 2], [3, 4]])

# 数组属性
arr.shape      # (5,)
arr.ndim       # 1
arr.dtype      # int64
arr.size       # 5

# 特殊数组
np.zeros((3, 4))        # 全零数组
np.ones((2, 3))         # 全一数组
np.arange(0, 10, 2)     # [0, 2, 4, 6, 8]
np.linspace(0, 1, 5)    # [0, 0.25, 0.5, 0.75, 1]

# 随机数组
np.random.rand(3, 4)    # 均匀分布
np.random.randn(3, 4)   # 正态分布
np.random.randint(0, 10, size=5)

数组运算

Python

import numpy as np

a = np.array([1, 2, 3])
b = np.array([4, 5, 6])

# 逐元素运算
a + b       # [5, 7, 9]
a * b       # [4, 10, 18]
a ** 2      # [1, 4, 9]

# 统计运算
a.sum()     # 6
a.mean()    # 2.0
a.std()     # 标准差
a.min()     # 1
a.max()     # 3

# 矩阵运算
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

A @ B       # 矩阵乘法
A.T         # 转置
np.dot(A, B)  # 点积（对二维数组等价于矩阵乘法 A @ B）
np.linalg.inv(A)  # 求逆

数组索引与切片

Python

arr = np.arange(10).reshape(2, 5)

# 基本索引
arr[0, 1]       # 1
arr[0, :]       # 第一行
arr[:, 1]       # 第二列

# 条件索引
arr[arr > 5]    # 大于5的元素

# 布尔索引
mask = arr % 2 == 0
arr[mask]       # 偶数元素

Pandas数据处理

Series和DataFrame

Python

import pandas as pd

# Series（一维）
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])

# DataFrame（二维）
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['NYC', 'LA', 'SF']
})

# 属性
df.shape       # (3, 3)
df.columns     # 列名
df.index       # 行索引
df.dtypes      # 数据类型

数据读取与写入

Python

import pandas as pd

# 读取CSV
df = pd.read_csv('data.csv')

# 读取Excel
df = pd.read_excel('data.xlsx')

# 读取JSON
df = pd.read_json('data.json')

# 读取SQL
df = pd.read_sql('SELECT * FROM users', connection)

# 写入文件
df.to_csv('output.csv', index=False)
df.to_excel('output.xlsx')
df.to_json('output.json')

数据选择与过滤

Python

# 列选择
df['name']               # 单列
df[['name', 'age']]      # 多列

# 行选择
df.loc[0]                # 按索引标签
df.iloc[0]               # 按位置
df.loc[0:2]              # 按标签切片（包含末端标签 2）

# 条件过滤
df[df['age'] > 30]
df[(df['age'] > 25) & (df['city'] == 'NYC')]

# 查询方法
df.query('age > 30 and city == "NYC"')

数据处理

Python

# 数据清洗
df.dropna()              # 删除缺失值
df.fillna(0)             # 填充缺失值
df.drop_duplicates()     # 删除重复

# 数据转换
df['age'].astype(float)  # 类型转换
df['name'].str.upper()   # 字符串操作

# 数据排序
df.sort_values('age')    # 按列排序
df.sort_index()          # 按索引排序

# 数据统计
df.describe()            # 统计摘要
df['age'].value_counts() # 值计数
df.groupby('city').mean()# 分组聚合（pandas 2.x 中若含非数值列，需传入 numeric_only=True）

# 数据合并
pd.concat([df1, df2])    # 拼接
pd.merge(df1, df2, on='id')  # 合并

Matplotlib可视化

基本绘图

Python

import matplotlib.pyplot as plt

# 简单折线图
plt.plot([1, 2, 3, 4], [1, 4, 9, 16])
plt.xlabel('X轴')
plt.ylabel('Y轴')
plt.title('折线图')
plt.show()

# 多条线
plt.plot(x, y1, label='Line 1')
plt.plot(x, y2, label='Line 2')
plt.legend()
plt.show()

# 样式设置
plt.plot(x, y, 'r--', linewidth=2, marker='o')
# 'r--' 红色虚线，marker='o' 圆点标记

图表类型

Python

import matplotlib.pyplot as plt

# 柱状图
plt.bar(['A', 'B', 'C'], [10, 20, 15])

# 直方图
plt.hist(data, bins=20)

# 散点图
plt.scatter(x, y, c='red', s=50)

# 饼图
plt.pie([30, 40, 30], labels=['A', 'B', 'C'])

# 箱线图
plt.boxplot(data)

# 子图
fig, axes = plt.subplots(2, 2)
axes[0, 0].plot(x, y)
axes[0, 1].bar(x, y)
axes[1, 0].scatter(x, y)
axes[1, 1].hist(y)
plt.show()

高级设置

Python

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(x, y)
ax.set_title('Title', fontsize=14)
ax.set_xlabel('X', fontsize=12)
ax.grid(True)
ax.set_xlim(0, 10)
ax.set_ylim(0, 100)

# 保存图片
plt.savefig('figure.png', dpi=300)

Seaborn统计可视化

Seaborn绘图

Python

import seaborn as sns

# 散点图（带回归线）
sns.regplot(x='age', y='salary', data=df)

# 分布图（distplot 自 Seaborn 0.11 起已弃用，新代码建议改用 sns.histplot 或 sns.displot）
sns.distplot(df['age'])

# 箱线图
sns.boxplot(x='city', y='age', data=df)

# 小提琴图
sns.violinplot(x='city', y='age', data=df)

# 热力图（相关性矩阵；pandas 2.x 中若含非数值列，需使用 df.corr(numeric_only=True)）
sns.heatmap(df.corr(), annot=True)

# 成对关系图
sns.pairplot(df)

# 分类散点图
sns.catplot(x='city', y='age', hue='gender', data=df)

工具链组合应用

数据分析流程

Python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. 数据读取
df = pd.read_csv('sales.csv')

# 2. 数据清洗
df = df.dropna()
df['date'] = pd.to_datetime(df['date'])

# 3. 数据分析
monthly_sales = df.groupby(df['date'].dt.month)['amount'].sum()

# 4. 可视化
plt.figure(figsize=(10, 6))
plt.bar(monthly_sales.index, monthly_sales.values)
plt.xlabel('Month')
plt.ylabel('Sales')
plt.title('Monthly Sales')
plt.show()

# 5. 统计摘要
print(df.describe())
print(df.corr())  # pandas 2.x 中若含非数值列，需传入 numeric_only=True

工具对比表

工具	核心功能	应用场景
NumPy	数值计算、矩阵运算	科学计算、底层支撑
Pandas	表格数据处理	数据清洗、分析
Matplotlib	基础可视化	定制图表
Seaborn	统计可视化	快速统计分析图
SciPy	科学计算扩展	数学、统计算法

安装与版本

Bash

# 安装数据科学工具
pip install numpy pandas matplotlib seaborn scipy

# 或使用Anaconda（推荐）
conda install numpy pandas matplotlib seaborn scipy

Python

# 版本检查
import numpy as np
import pandas as pd
import matplotlib

print(np.__version__)    # 1.26.x
print(pd.__version__)    # 2.x
print(matplotlib.__version__)  # 3.x

注意：Pandas 2.x性能优化显著，推荐使用最新版本；Anaconda预装常用数据科学库。

要点总结

NumPy：高效数组运算、矩阵计算、广播机制、底层支撑库
Pandas：DataFrame表格处理、数据清洗、分组聚合、IO读写
Matplotlib：基础绘图、高度可定制、支持多种图表类型
Seaborn：统计可视化、简化绘图、美化默认样式
工具组合：NumPy基础→Pandas处理→Matplotlib/Seaborn可视化

存放路径：articles/PYTHON/专家/生态与工具链/数据科学工具概览.md

📝 发现内容有误？点击此处直接编辑