kNN自体はもちろん知っていたが、理論的な考察についてはほとんど知らなかったので勉強になった。
面白かったところ
- 式を使ったボロノイ図の定義
- kNNとベイズ誤り率の関係
- kNNにおける次元の呪いの話
- 最近傍法とは?
- k最近傍法とは?
- ボロノイ図とは?
- kNNを使うときに問題となる次元の呪いについて説明せよ。
- 予測データと学習データの距離はどうなる?
- kNNの計算量は?
- kNNの計算量を減らすための工夫をいくつか説明せよ。
#include <iostream>
#include <vector>
#include <iomanip>
using namespace std;

// Shift a sample so its arithmetic mean becomes zero.
vector<double> centerize(const vector<double> &v) {
    const int n = v.size();
    double total = 0.0;
    for (int i = 0; i < n; i++) total += v[i];
    const double mean = total / n;
    vector<double> shifted(n);
    for (int i = 0; i < n; i++) shifted[i] = v[i] - mean;
    return shifted;
}

// Variance via the one-pass formula E[x^2] - E[x]^2.
// NOTE: this form is deliberately naive — when the sample mean is huge
// compared to the spread (see gen()), the two terms nearly cancel and
// the result collapses to 0 (catastrophic cancellation).
double calc(const vector<double> &v) {
    const int n = v.size();
    double moment1 = 0.0;  // running sum of x
    double moment2 = 0.0;  // running sum of x^2
    for (int i = 0; i < n; i++) {
        moment1 += v[i];
        moment2 += v[i] * v[i];
    }
    moment1 /= n;
    moment2 /= n;
    return moment2 - moment1 * moment1;
}

// Five consecutive values offset by 1e10: the true variance is 2, but the
// large offset exposes the precision loss in calc().
vector<double> gen() {
    vector<double> v(5);
    for (int i = 0; i < 5; i++) v[i] = 1e10 + i;
    return v;
}

int main(int argc, char *argv[]) {
    vector<double> v = gen();
    cout << fixed << setprecision(15);
    cout << calc(v) << endl;             // naive formula: loses all precision
    cout << calc(centerize(v)) << endl;  // center first: correct variance
    return 0;
}
$ ./main 0.000000000000000 2.000000000000000
using namespace std;
// Contest-template macros (only REP is used below).
#define ALL(x) (x).begin(), (x).end()
#define EACH(itr,c) for(__typeof((c).begin()) itr=(c).begin(); itr!=(c).end(); itr++)
#define FOR(i,b,e) for (int i=(int)(b); i<(int)(e); i++)
#define MP(x,y) make_pair(x,y)
#define REP(i,n) for(int i=0; i<(int)(n); i++)

// Global DP state, indexed [vertex][subtree size] (problem size <= 50, so 55).
int n;                        // number of vertices
long long sz[55][55];         // sz[v][i]:      #connected subtrees rooted at v with exactly i nodes
double long sum[55][55];      // sum[v][i]:     over those subtrees, total of (sum of node weights)
double long sum_sq[55][55];   // sum_sq[v][i]:  over those subtrees, total of (sum of squared weights)
double long sum_prd[55][55];  // sum_prd[v][i]: over those subtrees, total of (sum of w_a*w_b over node pairs a<b)
double long w[55];            // node weights, centered by the global mean (precision trick)
vector<int> child[55];        // children lists; vertex 0 is the root

// Tree-knapsack DP: for each vertex v, count all connected subtrees rooted at v
// and accumulate the weight statistics needed to evaluate each subtree's variance.
void dfs(int v) {
    // Reset this vertex's DP rows.
    REP (i, n+1) {
        sz[v][i] = 0;
        sum[v][i] = sum_sq[v][i] = sum_prd[v][i] = 0.0;
    }
    // Base case: the subtree consisting of v alone.
    sz[v][1] = 1;
    sum[v][1] = w[v];
    sum_sq[v][1] = w[v] * w[v];
    // Merge each child's table into v's. Iterating i DOWNWARD is essential:
    // sz[v][j] for j < i must still exclude the child c being merged.
    for (auto &c : child[v]) {
        dfs(c);
        for (int i = n; i > 1; i--) {
            for (int j = 1; j < i; j++) {
                int k = i - j;  // j nodes from v's side so far, k from child c
                sz[v][i] += sz[v][j] * sz[c][k];
                // Linear statistics combine weighted by the partner's count.
                sum[v][i] += sum[v][j] * sz[c][k] + sum[c][k] * sz[v][j];
                sum_sq[v][i] += sum_sq[v][j] * sz[c][k] + sum_sq[c][k] * sz[v][j];
                // Pair products: pairs within each side, plus cross pairs
                // (sum over one side) * (sum over the other).
                sum_prd[v][i] += sum_prd[v][j] * sz[c][k] + sum_prd[c][k] * sz[v][j] + sum[v][j] * sum[c][k];
            }
        }
    }
}

class AverageVarianceSubtree {
public:
    // Returns the average, over all connected subtrees, of the variance of the
    // node weights in the subtree. p[i] is the parent of vertex i+1.
    double average(vector<int> p, vector<int> weight) {
        n = weight.size();
        // Center weights by the global mean first so the E[x^2]-E[x]^2 style
        // variance formula below does not suffer catastrophic cancellation.
        double long avg = 0.0;
        REP (i, n) avg += weight[i];
        avg /= n;
        REP (i, n) w[i] = weight[i] - avg;
        // Build the child lists from the parent array.
        REP (i, n) child[i].clear();
        REP (i, p.size()) child[p[i]].push_back(i+1);
        dfs(0);
        long long tot = 0;      // total number of subtrees
        long double ret = 0.0;  // total variance over all subtrees
        REP (v, n) {
            for (int i = 1; i <= n; i++) {
                tot += sz[v][i];
                // Variance of one subtree of size i: (sum of w^2)/i - (sum of w)^2/i^2,
                // where (sum of w)^2 = sum_sq + 2*sum_prd, summed over all subtrees.
                ret += sum_sq[v][i] / i - (sum_sq[v][i] + 2 * sum_prd[v][i]) / i / i;
            }
        }
        return ret / tot;
    }
};
import numpy as np
import pandas
from matplotlib import pyplot as plt

# Population parameters of a correlated 2-D Gaussian.
mean = [0.0, 0.0]
sigma = [[1, 2], [2, 5]]

# Draw 300 samples and scatter-plot the raw (unwhitened) point cloud.
data = np.random.multivariate_normal(mean, sigma, 300)
x, y = data[:, 0], data[:, 1]
plt.plot(x, y, 'o')
# Whitening (sphering): rotate the centered data onto the covariance
# eigenbasis and rescale each axis by 1/sqrt(eigenvalue), so the result
# has (approximately) identity covariance.
df = pandas.DataFrame(data)
u = df.mean()
sigma = df.cov()
# Use eigh, not eig: the covariance matrix is symmetric, so eigh guarantees
# real eigenvalues and an orthonormal eigenvector basis (eig can leak
# complex round-off noise into lam and S).
lam, S = np.linalg.eigh(sigma)
# Sanity check: S diagonalizes sigma, i.e. S.T @ sigma @ S == diag(lam).
assert np.linalg.norm(np.dot(np.dot(S.T, sigma), S) - np.diag(lam)) < 1e-9
# Whitening matrix: scale by lam^(-1/2) after rotating into the eigenbasis.
whitening = np.dot(np.diag(lam**(-1/2)), S.T)
data_whitened = np.dot(whitening, (df - u).T).T
x, y = data_whitened.T
plt.plot(x, y, 'o')
using namespace std;
// Contest-template macros (only REP is used below).
#define ALL(x) (x).begin(), (x).end()
#define EACH(itr,c) for(__typeof((c).begin()) itr=(c).begin(); itr!=(c).end(); itr++)
#define FOR(i,b,e) for (int i=(int)(b); i<(int)(e); i++)
#define MP(x,y) make_pair(x,y)
#define REP(i,n) for(int i=0; i<(int)(n); i++)

// NOTE(review): the piece names and scoring match Google Code Jam 2017
// Qualification "Fashion Show" — confirm against the original post.
int n;              // board size (n x n)
int m;              // number of pre-placed pieces
int bd[100][100];   // initial board: bit 1 = 'x' piece, bit 2 = '+' piece, 3 = 'o' (both)
int be[100][100];   // improved board built from bd by adding pieces

// Maximum bipartite matching via augmenting paths (Hopcroft-Karp-free,
// O(V*E) Hungarian-style DFS). Vertices of both sides share one index space.
class BipartiteMatching {
    int V;                    // total number of vertices (both sides)
    vector<vector<int> > G;   // adjacency lists
    vector<int> match;        // match[v] = partner of v, or -1
    vector<bool> used;        // visited marks for one augmenting-path search
    // Try to find an augmenting path starting at v.
    bool dfs(int v) {
        used[v] = true;
        for (int i = 0; i < (int)G[v].size(); i++) {
            int u = G[v][i];
            int w = match[u];
            // u is free, or u's partner can be re-matched elsewhere.
            if (w < 0 || (!used[w] && dfs(w))) {
                match[v] = u;
                match[u] = v;
                return true;
            }
        }
        return false;
    }
public:
    BipartiteMatching(int v_size) : V(v_size), G(V), match(V), used(V) {}
    void add_edge(int u, int v) { G[u].push_back(v); G[v].push_back(u); }
    // Computes the maximum matching size; match[] holds the pairing afterwards.
    int count() {
        int ret = 0;
        fill(match.begin(), match.end(), -1);
        for (int v = 0; v < V; v++) {
            if (match[v] < 0) {
                fill(used.begin(), used.end(), false);
                if (dfs(v)) ++ret;
            }
        }
        return ret;
    }
    int getPair(int v) { return match[v]; }
};

// One test case: read the board, add as many pieces as possible, print the
// score, the number of changed cells, and the changed cells themselves.
// Bit 1 pieces conflict per row/column (rook-like); bit 2 pieces conflict
// per diagonal and anti-diagonal (bishop-like). The two constraint systems
// are independent, so each is solved as its own bipartite matching.
void solve() {
    cin >> n >> m;
    memset(bd, 0, sizeof(bd));
    REP (i, m) {
        char t;
        int y, x;
        cin >> t >> y >> x;
        --x, --y;  // input is 1-based
        if (t == 'x') bd[y][x] = 1;
        else if (t == '+') bd[y][x] = 2;
        else if (t == 'o') bd[y][x] = 3;
    }
    // Start from the given arrangement; only additions/upgrades follow.
    REP (i, n) REP (j, n) be[i][j] = bd[i][j];
    // rook
    // Left side = rows [0,n), right side = columns [n,2n). A matched (row,
    // column) pair is a new rook-like piece at their intersection.
    BipartiteMatching rook(2 * n);
    set<int> used;
    // Rows/columns already occupied by an existing bit-1 piece are off limits.
    REP (i, n) REP (j, n) if (bd[i][j] & 1) { used.insert(i); used.insert(j + n); }
    REP (i, n) REP (j, n) if (!used.count(i) && !used.count(j+n)) rook.add_edge(i, j+n);
    rook.count();
    REP (i, n) {
        int j = rook.getPair(i);
        if (j != -1) {
            be[i][j-n] |= 1;
        }
    }
    // bishop
    // Left side = anti-diagonals (i-j const) mapped to [0,2n-1); right side =
    // diagonals (i+j const) mapped to [2n-1,4n-3). Each cell joins its two
    // diagonals; a matched pair identifies one cell uniquely.
    BipartiteMatching bishop(4*n);
    used.clear();
    REP (i, n) REP (j, n) if (bd[i][j] & 2) { used.insert(n-1+i-j); used.insert(2*n-1+i+j); }
    REP (i, n) REP (j, n) {
        if (!used.count(n-1+i-j) && !used.count(2*n-1+i+j)) bishop.add_edge(n-1+i-j, 2*n-1+i+j);
    }
    bishop.count();
    REP (i, n) REP (j, n) {
        int d1 = n-1+i-j;
        int d2 = 2*n-1+i+j;
        if (bishop.getPair(d1) == d2) be[i][j] |= 2;
    }
    // output score
    // Each set bit scores one point ('o' = both bits = 2 points).
    int score = 0;
    REP (i, n) REP (j, n) {
        if (be[i][j] & 1) ++score;
        if (be[i][j] & 2) ++score;
    }
    cout << score << " ";
    // output changed arrangement
    int cnt = 0;
    REP (i, n) REP (j, n) cnt += bd[i][j] != be[i][j];
    cout << cnt << endl;
    REP (i, n) REP (j, n) {
        if (be[i][j] != bd[i][j]) {
            // Cell value 3 = 'o', 1 = 'x', otherwise (2) = '+'; coordinates 1-based.
            char t = be[i][j] == 3 ? 'o' : (be[i][j] == 1 ? 'x' : '+');
            cout << t << " " << i+1 << " " << j+1 << endl;
        }
    }
}

int main() {
    ios_base::sync_with_stdio(0);
    int T;  // number of test cases
    cin >> T;
    REP (i, T) {
        cerr << "Case #" << i+1 << ": " << endl;  // progress on stderr
        cout << "Case #" << i+1 << ": ";
        solve();
    }
    return 0;
}
// Bayesian simple linear regression: y ~ Normal(w*x + b, sigma).
data {
  int<lower=0> N;       // number of observations
  vector[N] x;          // predictor values
  vector[N] y;          // observed responses
}
parameters {
  real w;               // slope
  real b;               // intercept
  real<lower=0> sigma;  // residual standard deviation
}
model {
  // Vectorized likelihood; no explicit priors, so Stan's implicit
  // (improper) uniform priors apply to w, b and sigma.
  y ~ normal(w * x + b, sigma);
}
import numpy as np
import pystan
import matplotlib.pyplot as plt
import pandas as pd


def gen():
    """Generate N noisy samples from y = w*x + b with known true parameters.

    Returns:
        (x, y) tuple of numpy arrays of length N.
    """
    N = 30
    w = 10
    b = 3
    sigma = 0.5
    x = np.random.rand(N)
    err = sigma * np.random.randn(N)
    y = w * x + b + err
    return (x, y)


def train(x, y):
    """Fit the Stan linear model to (x, y) by MCMC and return the fit object."""
    data = { 'N': len(x), 'x': x, 'y': y }
    fit = pystan.stan(file='linear_model.stan', data=data,
                      iter=2000, warmup=1000, chains=4)
    return fit


def predict(param, x):
    """Predict y at x; param may hold scalars or whole posterior-sample columns."""
    return param['w'] * x + param['b']


if __name__ == '__main__':
    x, y = gen()
    fit = train(x, y)
    fit.plot()
    plt.show()
    xt = np.linspace(0, 1, 100)
    # Extract once and reuse: each fit.extract() call re-collates all chains.
    samples = fit.extract(permuted=True)
    params = pd.DataFrame({ 'w': samples.get('w'), 'b': samples.get('b') })
    median_params = params.median()
    yt = predict(median_params, xt)
    # 95% interval of the posterior regression line at each grid point.
    # Loop variable is xi (not x) to avoid shadowing the training data.
    yt_lower = [np.percentile(predict(params, xi), 2.5) for xi in xt]
    yt_upper = [np.percentile(predict(params, xi), 97.5) for xi in xt]
    plt.plot(x, y, 'bo')
    plt.plot(xt, yt, 'k')
    plt.fill_between(xt, yt_lower, yt_upper, facecolor='lightgrey', edgecolor='none')
    plt.show()
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


def bootstrap_unique_freq(n):
    """Resample n indices with replacement; return the fraction that are distinct."""
    draws = np.random.randint(0, n, n)
    return len(set(draws)) / n


if __name__ == '__main__':
    t = 100  # bootstrap repetitions per sample size
    sizes = [10, 100, 1000, 10000]
    ns = [n for n in sizes for _ in range(t)]                        # sample size labels
    xs = [bootstrap_unique_freq(n) for n in sizes for _ in range(t)]  # unique-sample freqs
    sns.boxplot(x=ns, y=xs)
    plt.show()