前提・実現したいこと
R言語を使って、MNISTの手書き数字のうち3と4を主成分分析で教師なし分類したいのですが、うまくいきません。片方の数字3だけを主成分分析で圧縮するコードを以下に貼ります。
該当のソースコード
R
library(ggplot2)
library(dplyr)
# install.packages("R.utils")
library(R.utils) # provides gunzip(), used by the optional download step below
library(gclus)
library(MASS)
# install.packages("recommenderlab")
library("recommenderlab")

# Optional: download the data from http://yann.lecun.com/exdb/mnist/
# download.file("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
#               "train-images-idx3-ubyte.gz")
# download.file("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
#               "train-labels-idx1-ubyte.gz")
# download.file("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
#               "t10k-images-idx3-ubyte.gz")
# download.file("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
#               "t10k-labels-idx1-ubyte.gz")

# Optional: gunzip the downloaded files
# R.utils::gunzip("train-images-idx3-ubyte.gz")
# R.utils::gunzip("train-labels-idx1-ubyte.gz")
# R.utils::gunzip("t10k-images-idx3-ubyte.gz")
# R.utils::gunzip("t10k-labels-idx1-ubyte.gz")

# Read an image file into a data frame with one row per image and one
# column per pixel (n_row * n_col columns, pixel values 0-255).
#
# filename: path to an uncompressed "*-images-idx3-ubyte" file.
load_image_file <- function(filename) {
  f <- file(filename, "rb")
  # Close the connection even if one of the reads below fails.
  on.exit(close(f), add = TRUE)
  # The first 4-byte header field is read and discarded.
  readBin(f, "integer", n = 1, size = 4, endian = "big")
  n <- readBin(f, "integer", n = 1, size = 4, endian = "big")
  n_row <- readBin(f, "integer", n = 1, size = 4, endian = "big")
  n_col <- readBin(f, "integer", n = 1, size = 4, endian = "big")
  x <- readBin(f, "integer", n = n * n_row * n_col, size = 1, signed = FALSE)
  data.frame(matrix(x, ncol = n_row * n_col, byrow = TRUE))
}

# Read a label file and return the labels as an integer vector.
#
# filename: path to an uncompressed "*-labels-idx1-ubyte" file.
load_label_file <- function(filename) {
  f <- file(filename, "rb")
  on.exit(close(f), add = TRUE)
  # The first 4-byte header field is read and discarded.
  readBin(f, "integer", n = 1, size = 4, endian = "big")
  n <- readBin(f, "integer", n = 1, size = 4, endian = "big")
  readBin(f, "integer", n = n, size = 1, signed = FALSE)
}

# Draw a length-784 pixel vector (or a one-row data frame) as a 28 x 28 image.
# A 785th element, if present (the label column), is dropped before plotting.
# Extra arguments in `...` are forwarded to image().
show_digit <- function(arr784, col = gray(12:1 / 12), ...) {
  image(matrix(as.matrix(arr784[-785]), nrow = 28)[, 28:1], col = col, ...)
}

# Directory holding the uncompressed MNIST files (Kaggle dataset mount point).
data_dir <- "../input/mnistdt"

# Load images
train <- load_image_file(file.path(data_dir, "train-images-idx3-ubyte"))
test <- load_image_file(file.path(data_dir, "t10k-images-idx3-ubyte"))

# Load labels
train$y <- as.factor(load_label_file(file.path(data_dir, "train-labels-idx1-ubyte")))
test$y <- as.factor(load_label_file(file.path(data_dir, "t10k-labels-idx1-ubyte")))

# First 100 images of the digit 3 (column 785 is the label and is dropped).
# NOTE: to analyse two digits together, stack their rows, e.g.
#   rbind(train[train$y == 3, ][1:100, -785], train[train$y == 4, ][1:100, -785])
# `+` adds the pixel values of paired images, and `train$y == 3 || 4` is a
# scalar condition, not an element-wise mask (use `train$y %in% c(3, 4)`).
X <- train[train$y == 3, ][1:100, -785]

# Mean vector
mu.X <- colMeans(X)
show_digit(255 - mu.X) # picture of the "average" handwritten 3

# Deviations from the mean, one row per image
Z <- sweep(as.matrix(X), 2, mu.X)
show_digit(Z[1, ])

show_digit(Z[10, ])

show_digit(Z[100, ])

cov.Z <- cov(Z)
dim(cov.Z)

pca.Z <- eigen(cov.Z)

show_digit((255 - X[1, ])) # full information (original image)

# Reconstruct the first image from its leading k principal components.
# NOTE: with 100 samples the covariance matrix has rank at most 99, so
# components beyond the 99th carry (numerically) zero variance; k = 100, 150
# and 200 therefore give essentially the same reconstruction.

# k = 50
U.50 <- pca.Z$vectors[, 1:50]
Z1.50 <- as.numeric(t(U.50) %*% Z[1, ])
UX.50 <- U.50 %*% as.matrix(Z1.50, ncol = 1)
show_digit(255 - (UX.50 + as.matrix(mu.X)))

# k = 100
U.100 <- pca.Z$vectors[, 1:100]
Z1.100 <- as.numeric(t(U.100) %*% Z[1, ])
UX.100 <- U.100 %*% as.matrix(Z1.100, ncol = 1)
show_digit(255 - (UX.100 + as.matrix(mu.X)))

# k = 150
U.150 <- pca.Z$vectors[, 1:150]
Z1.150 <- as.numeric(t(U.150) %*% Z[1, ])
UX.150 <- U.150 %*% as.matrix(Z1.150, ncol = 1)
show_digit(255 - (UX.150 + as.matrix(mu.X)))

# k = 200
U.200 <- pca.Z$vectors[, 1:200]
Z1.200 <- as.numeric(t(U.200) %*% Z[1, ])
UX.200 <- U.200 %*% as.matrix(Z1.200, ncol = 1)
show_digit(255 - (UX.200 + as.matrix(mu.X)))

# Proportion of variance explained by each component
plot(1:784, pca.Z$values / sum(pca.Z$values),
  type = "o", col = 2, lwd = 2,
  xlab = "dimension", ylab = "variance explained", cex = 0.4
)

# Cumulative proportion of variance explained
plot(1:784, cumsum(pca.Z$values) / sum(pca.Z$values),
  type = "o", col = 2, lwd = 2,
  xlab = "dimension", ylab = "variance explained", cex = 0.4
)

# Display the first 8 eigenvectors as images
U <- pca.Z$vectors[, 1:8]

for (i in seq_len(ncol(U))) {
  show_digit(255 - U[, i])
}
試したこと
上記の1つの数字を分類するコードで
X <- train[train$y==3,][1:100,-785] + train[train$y==4,][1:100,-785]
や
X <- train[train$y==3||4,][1:100,-785]
と試したりしましたが、3と4が混ざったような画像が出力されるだけで教師なし分類ができません。
補足情報(FW/ツールのバージョンなど)
KaggleのNotebook上のRエディタを使用しています。
回答1件
あなたの回答
tips
プレビュー