Graduate School/Mathematics for AI
Exploratory data analysis and visualization
- -
728x90
반응형
In [1]:
!pip install -q --upgrade matplotlib
/bin/bash: pip: command not found
들어가기¶
‘통계학자’ 나이팅게일의 ‘로즈 다이어그램’
In [2]:
from packaging import version
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
import numpy as np
import pandas as pd
# Fail fast when the installed Matplotlib is older than the 3.5 this notebook requires.
installed_mpl_version = version.parse(mpl.__version__)
required_mpl_version = version.Version("3.5")
assert installed_mpl_version >= required_mpl_version, (
    "에러가 난다면 첫 셀을 실행하고 ‘Runtime’ -> ‘Restart Runtime’를 눌러 주세요."
)

# Emit inline figures in the "png2x" format.
matplotlib_inline.backend_inline.set_matplotlib_formats("png2x")

# Start from the default style, then enable constrained layout for every figure.
mpl.style.use("default")
mpl.rcParams["figure.constrained_layout.use"] = True
In [3]:
# 유니코드 마이너스 기호가 포함된 한글 글꼴 불러오기
# Pretendard https://cactus.tistory.com/306
!wget -q -nc https://github.com/orioncactus/pretendard/releases/download/v1.3.3/Pretendard-1.3.3.zip
!unzip -nq Pretendard-1.3.3.zip -d pretendard
pretendard = mpl.font_manager.FontEntry(
fname="pretendard/public/static/Pretendard-Regular.otf",
name="Pretendard",
)
mpl.font_manager.fontManager.ttflist.append(pretendard)
mpl.rcParams.update({"font.family": ["Pretendard"] + mpl.rcParamsDefault["font.family"]})
# Datasaurus Dozen
# - https://www.autodesk.com/research/publications/same-stats-different-graphs
# - https://www.openintro.org/data/index.php?data=datasaurus
!wget -q -nc https://www.openintro.org/data/csv/datasaurus.csv
datasaurus = {
dataset: df[["x", "y"]].reset_index(drop=True)
for dataset, df in pd.read_csv("datasaurus.csv").groupby("dataset")
}
# Iris
# - https://gist.github.com/netj/8836201/
IRIS_CSV_URL = (
    "https://gist.githubusercontent.com/netj/8836201/raw/"
    "6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv"
)
iris_raw = pd.read_csv(IRIS_CSV_URL)

# Replace the dots in the column names with underscores.
iris = iris_raw.rename(
    columns={column: column.replace(".", "_") for column in iris_raw.columns}
)
# MNIST
import tensorflow as tf

mnist_train_split, mnist_test_split = tf.keras.datasets.mnist.load_data()
mnist_x_train, mnist_y_train = mnist_train_split
mnist_x_test, mnist_y_test = mnist_test_split

# CIFAR10
cifar10_train_split, cifar10_test_split = tf.keras.datasets.cifar10.load_data()
cifar10_x_train, cifar10_y_train = cifar10_train_split
cifar10_x_test, cifar10_y_test = cifar10_test_split

# Class names in index order; a later cell indexes this list with the integer labels.
cifar10_class_names = (
    "airplane automobile bird cat deer dog frog horse ship truck".split()
)
예제¶
In [4]:
# List the names of the Datasaurus tables.
datasaurus.keys()
Out[4]:
dict_keys(['away', 'bullseye', 'circle', 'dino', 'dots', 'h_lines', 'high_lines', 'slant_down', 'slant_up', 'star', 'v_lines', 'wide_lines', 'x_shape'])
In [5]:
# Show the (x, y) table stored under "away".
datasaurus["away"]
Out[5]:
x | y | |
---|---|---|
0 | 32.331110 | 61.411101 |
1 | 53.421463 | 26.186880 |
2 | 63.920202 | 30.832194 |
3 | 70.289506 | 82.533649 |
4 | 34.118830 | 45.734551 |
... | ... | ... |
137 | 59.851838 | 72.958391 |
138 | 48.960460 | 72.629526 |
139 | 46.844855 | 36.791714 |
140 | 39.963022 | 42.944915 |
141 | 66.704944 | 32.015095 |
142 rows × 2 columns
In [6]:
# Print the index range, column dtypes and non-null counts of the "away" table.
datasaurus["away"].info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 142 entries, 0 to 141 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 x 142 non-null float64 1 y 142 non-null float64 dtypes: float64(2) memory usage: 2.3 KB
In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the "away" table.
datasaurus["away"].describe()
Out[7]:
x | y | |
---|---|---|
count | 142.000000 | 142.000000 |
mean | 54.266100 | 47.834721 |
std | 16.769825 | 26.939743 |
min | 15.560750 | 0.015119 |
25% | 39.724115 | 24.625892 |
50% | 53.340296 | 47.535269 |
75% | 69.146597 | 71.803148 |
max | 91.639961 | 97.475771 |
In [8]:
# One column per table, one row per statistic (mean_x, mean_y, std_x, std_y).
summary_stats = pd.DataFrame(
    {
        name: pd.concat(
            [points.mean().add_prefix("mean_"), points.std().add_prefix("std_")]
        )
        for name, points in datasaurus.items()
    }
)
# Display with three decimal places.
summary_stats.style.format("{:.3f}")
Out[8]:
away | bullseye | circle | dino | dots | h_lines | high_lines | slant_down | slant_up | star | v_lines | wide_lines | x_shape | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mean_x | 54.266 | 54.269 | 54.267 | 54.263 | 54.260 | 54.261 | 54.269 | 54.268 | 54.266 | 54.267 | 54.270 | 54.267 | 54.260 |
mean_y | 47.835 | 47.831 | 47.838 | 47.832 | 47.840 | 47.830 | 47.835 | 47.836 | 47.831 | 47.840 | 47.837 | 47.832 | 47.840 |
std_x | 16.770 | 16.769 | 16.760 | 16.765 | 16.768 | 16.766 | 16.767 | 16.767 | 16.769 | 16.769 | 16.770 | 16.770 | 16.770 |
std_y | 26.940 | 26.936 | 26.930 | 26.935 | 26.930 | 26.940 | 26.940 | 26.936 | 26.939 | 26.930 | 26.938 | 26.938 | 26.930 |
In [9]:
# Size the grid from the number of tables. A fixed 3x4 grid offers only 12 axes,
# so zip() would silently drop the 13th table ("x_shape").
n_cols = 4
n_rows = -(-len(datasaurus) // n_cols)  # ceiling division
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(8, 2 * n_rows),
    sharex=True,
    sharey=True,
    squeeze=False,  # keep `axes` two-dimensional even for a single row
)
for (dataset, df), ax in zip(datasaurus.items(), axes.flat):
    ax.scatter(df["x"], df["y"], alpha=0.5)
    ax.set_title(dataset)

# Hide the unused trailing axes.
for unused_index in range(len(datasaurus), axes.size):
    axes.flat[unused_index].set_visible(False)
    if unused_index >= n_cols:
        # With sharex=True only the bottom row keeps x tick labels, so re-enable
        # them on the panel directly above each hidden one.
        axes.flat[unused_index - n_cols].tick_params(labelbottom=True)

fig.suptitle("Datasaurus 각 테이블별 산점도")
fig.supxlabel("x")
fig.supylabel("y")
pass
Matplotlib¶
In [10]:
# Data: sample points and the functions evaluated on them.
x = np.linspace(-2, 2)
x_squared = 0.5 * x ** 2
cosine_x = np.cos(2 * x)

# Visualization: draw the three curves on one set of axes, in this order.
fig, ax = plt.subplots(figsize=(4.5, 4.5))
for curve_values, curve_label in (
    (x, r"$y = x$"),
    (x_squared, r"$y = \frac{1}{2} x^2$"),
    (cosine_x, r"$y = \cos 2x$"),
):
    ax.plot(x, curve_values, label=curve_label)
ax.set_xlabel(r"$x$")
ax.set_ylabel(r"$y$")
ax.set_title("함수의 그래프")
ax.set_aspect("equal")
ax.grid()
ax.legend()
pass
Matplotlib을 이용한 시각화¶
In [11]:
# Data: positions of three balls sampled at 20 time points.
t = np.linspace(0, 4 * np.pi, 20)
y_blue = np.sin(t)
y_red = t ** 2 / 100
y_green = np.sin(1.5 * t) - 0.1 * t

# Draw one line per ball, each with its own marker, line style, color and label.
fig, ax = plt.subplots()
ball_series = [
    (y_blue, {"marker": "o", "ls": "-", "c": "tab:blue", "label": "파란 공"}),
    (y_red, {"marker": "^", "ls": "--", "c": "tab:red", "label": "빨간 공"}),
    (y_green, {"marker": "s", "ls": ":", "c": "tab:green", "label": "초록 공"}),
]
for positions, line_options in ball_series:
    ax.plot(t, positions, **line_options)
ax.set(
    xlabel="시간 (second)",
    ylabel="위치 (cm)",
    title="공의 위치",
)
# Or, equivalently:
# ax.set_xlabel("시간 (second)")
# ax.set_ylabel("위치 (cm)")
# ax.set_title("공의 위치")
ax.legend()
pass
In [12]:
# Demonstration: a cell whose last expression is ax.legend() shows the Legend repr.
fig, ax = plt.subplots(figsize=(6, 1))
ax.plot([0, 1], [0, 1], label="label")
ax.legend() # <- returns a Legend instance
# Jupyter displays the value returned by the last expression of a cell.
Out[12]:
<matplotlib.legend.Legend at 0x29a3c6df0>
In [13]:
# Demonstration: a trailing semicolon keeps the Legend repr out of the cell output.
fig, ax = plt.subplots(figsize=(6, 1))
ax.plot([0, 1], [0, 1], label="label")
ax.legend();
In [14]:
# Demonstration: ending the cell with `pass` leaves no final expression to display,
# so no Legend repr is shown.
fig, ax = plt.subplots(figsize=(6, 1))
ax.plot([0, 1], [0, 1], label="label")
ax.legend()
pass
In [15]:
# Display the Iris table.
iris
Out[15]:
sepal_length | sepal_width | petal_length | petal_width | variety | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | Virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | Virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | Virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | Virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | Virginica |
150 rows × 5 columns
In [16]:
# Features to plot; `feature_additional` is defined here but not drawn in this cell.
feature_x = "sepal_length"
feature_y = "petal_length"
feature_additional = "petal_width"

fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(iris[feature_x], iris[feature_y], alpha=0.7)
ax.set_xlabel(feature_x)
ax.set_ylabel(feature_y)
ax.set_title("Iris")
pass
In [17]:
feature_x = "sepal_length"
feature_y = "petal_length"
feature_additional = "petal_width"

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=True)

# Left panel: encode the third feature as marker area. The factor scales the areas
# so that the feature's mean maps to the squared default marker size.
default_marker_area = plt.rcParams["lines.markersize"] ** 2
marker_size_factor = default_marker_area / iris[feature_additional].mean()
pc = ax1.scatter(
    iris[feature_x],
    iris[feature_y],
    s=marker_size_factor * iris[feature_additional],
    alpha=0.7,
)
# `func` undoes the scaling so the legend shows feature values, not marker areas.
size_handles, size_labels = pc.legend_elements(
    prop="sizes", func=lambda s: s / marker_size_factor
)
ax1.legend(size_handles, size_labels, title=feature_additional)

# Right panel: encode the same feature as color, with a colorbar as the key.
pc = ax2.scatter(
    iris[feature_x],
    iris[feature_y],
    c=iris[feature_additional],
    cmap="viridis",
    alpha=0.7,
)
colorbar = fig.colorbar(pc, label=feature_additional)

fig.suptitle("Iris")
fig.supxlabel(feature_x)
fig.supylabel(feature_y)
pass
In [18]:
# Integer-rounded samples, so many points share exactly the same coordinates.
rng = np.random.default_rng(78)
n = 300
x = np.round(rng.normal(scale=2, size=n))
y = np.round(rng.normal(scale=2, size=n))

fig, axes = plt.subplots(1, 3, figsize=(7.5, 2.5), sharex=True, sharey=True)
ax_plain, ax_jitter, ax_hexbin = axes

# Panel 1: plain scatter plot.
ax_plain.scatter(x, y)
ax_plain.set_title("아무 처리도 안 한 산점도")

# Panel 2: add small Gaussian noise to both coordinates and draw semi-transparently.
noise_strength = 0.15
x_with_noise = x + rng.normal(scale=noise_strength, size=x.shape)
y_with_noise = y + rng.normal(scale=noise_strength, size=y.shape)
ax_jitter.scatter(x_with_noise, y_with_noise, alpha=0.3)
ax_jitter.set_title("노이즈를 추가한 산점도")

# Panel 3: hexagonal 2-D histogram with a colorbar.
hb = ax_hexbin.hexbin(x, y, gridsize=8)
fig.colorbar(hb, label="빈도")
ax_hexbin.set_title("육각형 2D 히스토그램")
ax_hexbin.spines[:].set_visible(False)

for ax in axes.flat:
    ax.set_aspect("equal")
fig.supxlabel("x")
fig.supylabel("y")
pass
In [19]:
# (title, admissions) pairs, listed in descending order of admissions.
box_office_records = [
    ("명량", 17_613_682),
    ("극한직업", 16_264_944),
    ("신과함께-죄와 벌", 14_410_754),
    ("국제시장", 14_257_115),
    ("어벤져스: 엔드게임", 13_934_592),
    ("겨울왕국 2", 13_747_792),
    ("베테랑", 13_414_372),
    ("아바타", 13_338_863),
]
# Parallel lists used by the plotting cells.
film_title = [title for title, _ in box_office_records]
admissions = [count for _, count in box_office_records]
In [20]:
from matplotlib.ticker import FuncFormatter


def _format_admissions(x, pos):
    """Format a tick value as a count of ten-thousands followed by "만"."""
    return f"{x / 10000:,.0f}만"


# Reused by the following bar-chart cells.
admission_formatter = FuncFormatter(_format_admissions)

fig, ax = plt.subplots()
bc = ax.bar(film_title, admissions)
# Declutter: no tick marks, only the bottom spine, horizontal grid lines.
ax.tick_params(axis="both", length=0)
ax.yaxis.set_major_formatter(admission_formatter)
ax.set_ylabel("관객수 (명)")
ax.set_title("대한민국의 영화 흥행 기록")
for side in ("top", "right", "left"):
    ax.spines[side].set_visible(False)
ax.grid(axis="y")
In [21]:
# Same bar chart as before, with the x tick labels rotated.
fig, ax = plt.subplots()
ax.bar(film_title, admissions)
ax.tick_params(axis="both", length=0)
ax.yaxis.set_major_formatter(admission_formatter)
ax.set_ylabel("관객수 (명)")
ax.set_title("대한민국의 영화 흥행 기록")
# Rotate the x tick labels by 45 degrees and right-align them.
plt.setp(ax.get_xticklabels(), ha="right", rotation=45)
for side in ("top", "right", "left"):
    ax.spines[side].set_visible(False)
ax.grid(axis="y")
In [22]:
# Horizontal bar chart of the same records.
fig, ax = plt.subplots()
# barh draws its first entry at the bottom, so reverse BOTH lists to put the
# first-listed film on top. Reversing only the counts would pair every title
# with another film's admissions.
ax.barh(film_title[::-1], admissions[::-1])
ax.xaxis.set_tick_params(length=0)
ax.yaxis.set_tick_params(length=0)
ax.xaxis.set_major_formatter(admission_formatter)
# DO NOT DO THIS
# (presumably because starting the axis above zero exaggerates the differences
# between the bars)
# ax.set_xlim(left=1.3e7)
ax.set(
    xlabel="관객수 (명)",
    title="대한민국의 영화 흥행 기록",
)
ax.spines[["top", "right", "bottom"]].set_visible(False)
ax.grid(axis="x")
In [23]:
# Mixture sample: n draws centered at 0 followed by n draws centered at 4.
rng = np.random.default_rng(78)
n = 200
x = np.concatenate((rng.normal(loc=0, size=n), rng.normal(loc=4, size=n)))

# The same data drawn with three different bin counts.
fig, axes = plt.subplots(1, 3, figsize=(9, 3), sharex=True, sharey=True)
for bins, ax in zip([6, 20, 100], axes.flat):
    ax.hist(x, bins=bins, density=True)
    ax.set_title(f"bins = {bins}", loc="left")
fig.suptitle("Histogram of x")
fig.supxlabel("Value of x")
# density=True normalizes the bars so their total area is 1; the y axis is
# therefore a probability density, not a probability.
fig.supylabel("Probability density")
pass
In [24]:
# Values of one feature, grouped by variety (variety name -> NumPy array).
target_feature = "sepal_length"
iris_groupby_variety = {
    name: df[target_feature].to_numpy() for name, df in iris.groupby("variety")
}

# Matplotlib 3.9 renamed boxplot's `labels` parameter to `tick_labels` and
# deprecated the old name; pick the keyword the installed version expects.
boxplot_labels_kw = (
    "tick_labels"
    if version.parse(mpl.__version__) >= version.Version("3.9")
    else "labels"
)

fig, ax = plt.subplots()
ax.boxplot(
    list(iris_groupby_variety.values()),
    **{boxplot_labels_kw: list(iris_groupby_variety.keys())},
)
ax.spines[["top", "right", "left"]].set_visible(False)
ax.yaxis.set_tick_params(length=0)
ax.grid(axis="y")
ax.set(
    ylabel=target_feature,
    title="Iris 데이터 종별 상자 수염 그림",
)
pass
In [25]:
# Box plot of one variety, annotated with the statistics it is built from.
fig, ax = plt.subplots()
target_variety = "Virginica"
data = iris_groupby_variety[target_variety]

# Matplotlib 3.9 renamed boxplot's `labels` parameter to `tick_labels` and
# deprecated the old name; pick the keyword the installed version expects.
boxplot_labels_kw = (
    "tick_labels"
    if version.parse(mpl.__version__) >= version.Version("3.9")
    else "labels"
)
ax.boxplot(data, **{boxplot_labels_kw: [target_variety]})
ax.set(
    title="상자 수염 그림",
    ylabel=target_feature,
)
ax.spines[["top", "right", "left"]].set_visible(False)
ax.grid(axis="y")

# Options shared by every annotation: boxed text with an arrow to the data value.
annotation_options = {
    "va": "center",
    "arrowprops": {"arrowstyle": "->"},
    "bbox": {"fc": "whitesmoke", "ec": "gray"},
}

# Left of the box: the five-number summary.
quantiles = np.quantile(data, [0, 0.25, 0.5, 0.75, 1])
for name, q in zip(["min", "Q1", "median", "Q3", "max"], quantiles):
    ax.annotate(
        f"{name}: {q:.3f}",
        xy=(0.9, q),
        xytext=(0.75, q),
        **annotation_options,
        ha="right",
    )

# Right of the box: the mean and the 1.5 x IQR fences.
iqr = quantiles[3] - quantiles[1]
for name, q in [
    ["mean", data.mean().item()],
    ["Q3 + 1.5 × IQR", quantiles[3] + 1.5 * iqr],
    ["Q1 - 1.5 × IQR", quantiles[1] - 1.5 * iqr],
]:
    ax.annotate(
        f"{name}: {q:.3f}",
        xy=(1.1, q),
        xytext=(1.25, q),
        **annotation_options,
        ha="left",
    )
In [26]:
# Report the shape, dtype and value range of the CIFAR10 training images.
cifar10_value_range = (cifar10_x_train.min(), cifar10_x_train.max())
print(
    f"cifar10_x_train.shape = {cifar10_x_train.shape}",
    f"cifar10_x_train.dtype = {cifar10_x_train.dtype}",
    f"cifar10_x_train.min(), .max() = {cifar10_value_range}",
    sep="\n",
)
cifar10_x_train.shape = (50000, 32, 32, 3) cifar10_x_train.dtype = uint8 cifar10_x_train.min(), .max() = (0, 255)
In [27]:
# Show the first training images (one per axes), titled with their class names.
fig, axes = plt.subplots(2, 5, figsize=(6, 3))
for matrix, label, ax in zip(cifar10_x_train, cifar10_y_train, axes.flat):
    ax.imshow(matrix)
    ax.set_axis_off()
    # .item() converts the label array element to a plain int for list indexing.
    ax.set_title(cifar10_class_names[label.item()])
# Title typo fixed ("세트트첫" -> "세트 첫"), matching the MNIST figure title.
fig.suptitle(f"CIFAR10 훈련 세트 첫 {axes.size}개 샘플")
pass
In [28]:
# One color image shown as-is, darkened (values halved) and brightened
# (values halved, then shifted up by 128).
sample = cifar10_x_train[6]
fig, axes = plt.subplots(1, 3, figsize=(4, 2))
variants = {
    "원본": sample,
    "어둡게": sample // 2,
    "밝게": sample // 2 + 128,
}
for ax, (title, image) in zip(axes, variants.items()):
    ax.imshow(image)
    ax.set_title(title)
for ax in axes.flat:
    ax.set_axis_off()
In [29]:
# Report the shape, dtype and value range of the MNIST training images.
mnist_value_range = (mnist_x_train.min(), mnist_x_train.max())
print(
    f"mnist_x_train.shape = {mnist_x_train.shape}",
    f"mnist_x_train.dtype = {mnist_x_train.dtype}",
    f"mnist_x_train.min(), .max() = {mnist_value_range}",
    sep="\n",
)
mnist_x_train.shape = (60000, 28, 28) mnist_x_train.dtype = uint8 mnist_x_train.min(), .max() = (0, 255)
In [30]:
# Show the first training digits (one per axes), titled with their labels.
fig, axes = plt.subplots(2, 5, figsize=(6, 3))
first_samples = zip(axes.flat, mnist_x_train, mnist_y_train)
for ax, matrix, label in first_samples:
    ax.imshow(matrix)
    ax.set_title(label)
    ax.set_axis_off()
fig.suptitle(f"MNIST 훈련 세트 첫 {axes.size}개 샘플")
pass
In [31]:
# The same darken / brighten transformations applied to a single-channel digit,
# drawn with imshow's default settings.
sample = mnist_x_train[7]
fig, axes = plt.subplots(1, 3, figsize=(4, 2))
variants = {
    "원본": sample,
    "어둡게???": sample // 2,
    "밝게???": sample // 2 + 128,
}
for ax, (title, image) in zip(axes, variants.items()):
    ax.imshow(image)
    ax.set_title(title)
for ax in axes.flat:
    ax.set_axis_off()
In [32]:
# Same three panels, each with its own colorbar showing the value-to-color mapping.
sample = mnist_x_train[7]
fig, axes = plt.subplots(1, 3, figsize=(7, 2))
variants = [
    ("원본", sample),
    ("어둡게???", sample // 2),
    ("밝게???", sample // 2 + 128),
]
for ax, (title, image) in zip(axes, variants):
    im = ax.imshow(image)
    ax.set_title(title)
    fig.colorbar(im, ax=ax)
for ax in axes.flat:
    ax.set_axis_off()
In [33]:
def imshow_test(x):
    """Draw `x` four ways side by side and return the figure and its axes.

    Panels, left to right: gray colormap with the color range fixed to 0-255;
    `x` tiled into three identical channels; imshow with default arguments;
    gray colormap without a fixed range.
    """
    fig, axes = plt.subplots(1, 4, figsize=(5, 2))
    # Force broadcast to (H, W, 3) -- assumes x is a 2-D (H, W) array.
    x_rgb = np.tile(x[..., np.newaxis], (1, 1, 3))
    panels = [
        ("gray + vrange", x, {"cmap": "gray", "vmin": 0, "vmax": 255}),
        ("강제 RGB 변환", x_rgb, {}),
        ("기본값(viridis)", x, {}),
        ("gray", x, {"cmap": "gray"}),
    ]
    for ax, (title, image, imshow_kws) in zip(axes.flat, panels):
        ax.imshow(image, **imshow_kws)
        ax.set_title(title)
        ax.set_axis_off()
    return fig, axes


# Compare the original, darkened and brightened digit under each rendering.
sample = mnist_x_train[7]
fig_sample, _ = imshow_test(sample)
fig_dark, _ = imshow_test(sample // 2)
fig_light, _ = imshow_test(sample // 2 + 128)
for figure, figure_title in (
    (fig_sample, "원본"),
    (fig_dark, "어둡게"),
    (fig_light, "밝게"),
):
    figure.suptitle(figure_title)
pass
색깔 선택¶
In [34]:
# Three phase-shifted sine curves.
x = np.linspace(0, 3 * np.pi)
y1 = np.sin(x) + 5
y2 = np.sin(x - 1) + 5
y3 = np.sin(x - 2) + 5

# One row per way of naming red / green / cyan: hex codes, plain names, "tab:" names.
fig, axes = plt.subplots(3, 1, figsize=(6, 6))
color_cycles = [
    ["#FF0000", "#00FF00", "#00FFFF"],
    ["red", "green", "cyan"],
    ["tab:red", "tab:green", "tab:cyan"],
]
for color_cycle, ax in zip(color_cycles, axes.flat):
    # Label each curve with the color specification used to draw it.
    for curve, color in zip((y1, y2, y3), color_cycle):
        ax.plot(x, curve, c=color, label=color)
    ax.legend()
In [35]:
# Evaluate a 2-D function on a 90 x 100 grid.
x = np.linspace(0, 5, 100)
y = np.linspace(0, 5, 90)
X, Y = np.meshgrid(x, y)
Z = np.sin(X) ** 5 + np.cos(X) * np.cos(Y)

# Draw the same image with four colormaps, one subfigure (with colorbar) each.
fig = plt.figure()
subfigs = fig.subfigures(2, 2)
image_extent = [x[0], x[-1], y[0], y[-1]]
for cmap, subfig in zip(["viridis", "turbo", "jet", "hsv"], subfigs.flat):
    ax = subfig.subplots()
    image = ax.imshow(Z, extent=image_extent, origin="lower", cmap=cmap)
    subfig.colorbar(image, ax=ax)
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title(cmap)
pass
In [36]:
# Correlation matrix of every column except the last one.
features = iris.columns[:-1]
corr = iris[features].corr().to_numpy()
# Mask the entries strictly above the diagonal so only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr), k=1)
corr_tril = np.ma.array(corr, mask=mask)


def iris_corr(**imshow_kws):
    """Draw the masked correlation matrix with a colorbar.

    Keyword arguments are forwarded to `Axes.imshow`. Returns the figure and axes.
    """
    fig, ax = plt.subplots(figsize=(4, 3))
    im = ax.imshow(corr_tril, **imshow_kws)
    fig.colorbar(im, ax=ax)
    tick_positions = range(corr.shape[0])
    ax.set_xticks(tick_positions, features)
    ax.set_yticks(tick_positions, features)
    ax.set_title("Iris correlation")
    # Vertical x tick labels.
    plt.setp(ax.get_xticklabels(), ha="center", va="top", rotation=90)
    return fig, ax


# Default colormap, then a diverging colormap, then the same with a fixed -1..1 range.
for imshow_options in ({}, {"cmap": "RdBu"}, {"cmap": "RdBu", "vmin": -1, "vmax": 1}):
    iris_corr(**imshow_options)
pass
In [37]:
# Three phase-shifted sine curves.
x = np.linspace(0, 3 * np.pi)
y1 = np.sin(x) + 5
y2 = np.sin(x - 1) + 5
y3 = np.sin(x - 2) + 5


def _plot_three_sines(title):
    """Draw y1, y2 and y3 on a new figure, using whatever style is active when called."""
    fig, ax = plt.subplots(figsize=(5, 2), constrained_layout=True)
    for values, label in zip((y1, y2, y3), "abc"):
        ax.plot(x, values, label=label)
    ax.legend()
    ax.set_title(title)
    return fig, ax


# Matplotlib 3.6 renamed the bundled seaborn styles with a "seaborn-v0_8" prefix and
# later releases removed the old names; use whichever name this installation has.
seaborn_theme = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"

for theme in ["default", "classic", "dark_background", "fivethirtyeight", seaborn_theme]:
    with plt.style.context(theme):
        fig, ax = _plot_three_sines(f"Theme: {theme}")

with plt.xkcd():
    fig, ax = _plot_three_sines("For fun: xkcd style (not a theme)")
파일로 저장하기¶
In [38]:
# Evaluate a 2-D function on a 199 x 200 grid.
x = np.linspace(0, 5, 200)
y = np.linspace(0, 5, 199)
X, Y = np.meshgrid(x, y)
Z = np.sin(X) ** 5 + np.cos(X) * np.cos(Y)

# Figure sized as 12 cm x 9 cm (figsize is given in inches).
cm_per_inch = 2.54
fig, ax = plt.subplots(figsize=(12 / cm_per_inch, 9 / cm_per_inch))

# Pass the level count once, by keyword. The cell previously also passed a
# positional 3, which is ignored when `levels=` is given and only obscured
# which value applies.
contours = ax.contour(X, Y, Z, colors="black", levels=5)
ax.clabel(contours, inline=True, fontsize=8)

# Symmetric color limits so that zero sits at the center of the diverging colormap.
vlim = np.abs(Z).max()
im = ax.imshow(
    Z,
    extent=[x[0], x[-1], y[0], y[-1]],
    origin="lower",
    vmin=-vlim,
    vmax=vlim,
    cmap="RdBu",
    alpha=0.5,
)
ax.set(
    xlabel=r"$x$",
    ylabel=r"$y$",
    title=r"$\sin^5(x) + \cos(x) \cos(y)$",
)
fig.colorbar(im)

# Save the same figure in several formats and with different options.
fig.savefig("image.png")
fig.savefig("image-transparent.png", transparent=True)
fig.savefig("image-600dpi.png", dpi=600)
fig.savefig("image.jpg")
fig.savefig("image.svg")
fig.savefig("image.pdf")
728x90
반응형
'Graduate School > Mathematics for AI' 카테고리의 다른 글
Markov Decision Process Example (0) | 2024.09.10 |
---|---|
IBM HR data Binary Classification (0) | 2024.09.10 |
Contents
소중한 공감 감사합니다