Home>

I'd like to use R to classify the MNIST digits 3 and 4 with principal component analysis in an unsupervised way, i.e. without using the labels for 3 and 4. Below I have pasted the code that compresses only the single digit 3 with principal component analysis.

Applicable source code
library (ggplot2)
library (dplyr)
# install.packages ("R.utils")
library (R.utils) # Use unzip ()
library (gclus)
library (MASS)
# install.packages ("recommenderlab")
library ("recommenderlab")
#download data from http://yann.lecun.com/exdb/mnist/
# download.file ("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
# "train-images-idx3-ubyte.gz")
# download.file ("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
# "train-labels-idx1-ubyte.gz")
# download.file ("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
# "t10k-images-idx3-ubyte.gz")
# download.file ("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
# "t10k-labels-idx1-ubyte.gz")
# gunzip the file
# R.utils :: gunzip ("train-images-idx3-ubyte.gz")
# R.utils :: gunzip ("train-labels-idx1-ubyte.gz")
# R.utils :: gunzip ("t10k-images-idx3-ubyte.gz")
# R.utils :: gunzip ("t10k-labels-idx1-ubyte.gz")
# load image files
# Read an uncompressed IDX3 image file (the MNIST image format) into a
# data frame.
#
# The file starts with four big-endian 32-bit integers: a magic number (read
# but not used), the number of images, the rows per image and the columns per
# image. Every following byte is one unsigned 8-bit pixel value.
#
# filename: path to the uncompressed IDX3 file.
# Returns:  a data.frame with one row per image and rows * columns integer
#           pixel columns.
# Errors:   stops if the header is incomplete or if the file holds fewer
#           pixel bytes than the header announces.
load_image_file <- function(filename) {
  f <- file(filename, "rb")
  # Release the connection even when one of the reads below fails.
  on.exit(close(f), add = TRUE)

  header <- readBin(f, "integer", n = 4, size = 4, endian = "big")
  if (length(header) != 4) {
    stop("'", filename, "' has an incomplete IDX3 header", call. = FALSE)
  }
  n_images <- header[2]
  n_rows <- header[3]
  n_cols <- header[4]

  n_pixels <- n_images * n_rows * n_cols
  x <- readBin(f, "integer", n = n_pixels, size = 1, signed = FALSE)
  # matrix() would silently recycle a short vector, so reject truncated files.
  if (length(x) != n_pixels) {
    stop(
      "'", filename, "' is truncated: expected ", n_pixels,
      " pixel bytes, got ", length(x),
      call. = FALSE
    )
  }

  # byrow = TRUE: consecutive bytes belong to the same image.
  data.frame(matrix(x, ncol = n_rows * n_cols, byrow = TRUE))
}
# load label files
# Read an uncompressed IDX1 label file (the MNIST label format).
#
# The file starts with two big-endian 32-bit integers: a magic number (read
# but not used) and the number of labels. Every following byte is one
# unsigned 8-bit label.
#
# filename: path to the uncompressed IDX1 file.
# Returns:  an integer vector with one label per item.
# Errors:   stops if the header is incomplete or if the file holds fewer
#           labels than the header announces.
load_label_file <- function(filename) {
  f <- file(filename, "rb")
  # Release the connection even when one of the reads below fails.
  on.exit(close(f), add = TRUE)

  header <- readBin(f, "integer", n = 2, size = 4, endian = "big")
  if (length(header) != 2) {
    stop("'", filename, "' has an incomplete IDX1 header", call. = FALSE)
  }
  n_labels <- header[2]

  y <- readBin(f, "integer", n = n_labels, size = 1, signed = FALSE)
  if (length(y) != n_labels) {
    stop(
      "'", filename, "' is truncated: expected ", n_labels,
      " labels, got ", length(y),
      call. = FALSE
    )
  }
  y
}
# load images
# Training and test images, one row per image. Each assignment is its own
# statement; two assignments on a single line without a separator do not
# parse in R.
train <- load_image_file("train-images-idx3-ubyte")
test <- load_image_file("t10k-images-idx3-ubyte")
# load labels
# The labels are stored as a factor column `y` appended after the pixel
# columns of each data frame.
train$y <- as.factor(load_label_file("train-labels-idx1-ubyte"))
test$y <- as.factor(load_label_file("t10k-labels-idx1-ubyte"))
# helper function for visualization
# Draw one digit with image().
#
# arr784: the pixel values of one digit. Element/column 785 is dropped when
#         present (in this script that position holds the label `y`), and the
#         remaining values are laid out as a matrix with 28 rows.
# col:    colour palette handed to image(); the default is a 12-step grey
#         ramp running from light to dark.
# ...:    further arguments forwarded unchanged to image().
# Returns whatever image() returns; called for its plotting side effect.
show_digit <- function(arr784, col = gray(12:1 / 12), ...) {
  pixels <- as.matrix(arr784[-785])
  grid <- matrix(pixels, nrow = 28)
  # Reverse the column order so the digit is drawn upright by image().
  image(grid[, 28:1], col = col, ...)
}
# load image files
# Read an uncompressed IDX3 image file (the MNIST image format) into a
# data frame. This redefines load_image_file; a definition with the same
# name appears earlier in this script, and the later one is what the
# statements below call.
#
# The file starts with four big-endian 32-bit integers: a magic number (read
# but not used), the number of images, the rows per image and the columns per
# image. Every following byte is one unsigned 8-bit pixel value.
#
# filename: path to the uncompressed IDX3 file.
# Returns:  a data.frame with one row per image and rows * columns integer
#           pixel columns.
# Errors:   stops if the header is incomplete or if the file holds fewer
#           pixel bytes than the header announces.
load_image_file <- function(filename) {
  f <- file(filename, "rb")
  # Release the connection even when one of the reads below fails.
  on.exit(close(f), add = TRUE)

  header <- readBin(f, "integer", n = 4, size = 4, endian = "big")
  if (length(header) != 4) {
    stop("'", filename, "' has an incomplete IDX3 header", call. = FALSE)
  }
  n_images <- header[2]
  n_rows <- header[3]
  n_cols <- header[4]

  n_pixels <- n_images * n_rows * n_cols
  x <- readBin(f, "integer", n = n_pixels, size = 1, signed = FALSE)
  # matrix() would silently recycle a short vector, so reject truncated files.
  if (length(x) != n_pixels) {
    stop(
      "'", filename, "' is truncated: expected ", n_pixels,
      " pixel bytes, got ", length(x),
      call. = FALSE
    )
  }

  # byrow = TRUE: consecutive bytes belong to the same image.
  data.frame(matrix(x, ncol = n_rows * n_cols, byrow = TRUE))
}
# load label files
# Read an uncompressed IDX1 label file (the MNIST label format). This
# redefines load_label_file; a definition with the same name appears earlier
# in this script, and the later one is what the statements below call.
#
# The file starts with two big-endian 32-bit integers: a magic number (read
# but not used) and the number of labels. Every following byte is one
# unsigned 8-bit label.
#
# filename: path to the uncompressed IDX1 file.
# Returns:  an integer vector with one label per item.
# Errors:   stops if the header is incomplete or if the file holds fewer
#           labels than the header announces.
load_label_file <- function(filename) {
  f <- file(filename, "rb")
  # Release the connection even when one of the reads below fails.
  on.exit(close(f), add = TRUE)

  header <- readBin(f, "integer", n = 2, size = 4, endian = "big")
  if (length(header) != 2) {
    stop("'", filename, "' has an incomplete IDX1 header", call. = FALSE)
  }
  n_labels <- header[2]

  y <- readBin(f, "integer", n = n_labels, size = 1, signed = FALSE)
  if (length(y) != n_labels) {
    stop(
      "'", filename, "' is truncated: expected ", n_labels,
      " labels, got ", length(y),
      call. = FALSE
    )
  }
  y
}
# load images
# Directory that holds the four uncompressed MNIST files.
# NOTE(review): the paths were written as "../ input/mnistdt/..." with a
# space after "../", which names a directory called " input". The space is
# assumed to be a typo for "../input/mnistdt" - confirm against the dataset
# location in the notebook.
mnist_dir <- "../input/mnistdt"
train <- load_image_file(file.path(mnist_dir, "train-images-idx3-ubyte"))
test <- load_image_file(file.path(mnist_dir, "t10k-images-idx3-ubyte"))
# load labels
# The labels are stored as a factor column `y` appended after the pixel
# columns of each data frame.
train$y <- as.factor(
  load_label_file(file.path(mnist_dir, "train-labels-idx1-ubyte"))
)
test$y <- as.factor(
  load_label_file(file.path(mnist_dir, "t10k-labels-idx1-ubyte"))
)
#First 100 data of number 3
# Rows of `train` labelled 3, first 100 of them, without column 785 (for
# 28 x 28 images that column is the label `y`, so only pixels remain).
X <- train[train$y == 3, ][1:100, -785]
#Average vector
mu.X <- colMeans(X)
show_digit(255 - mu.X) # Average handwritten 3 digit figure
# Error Z
# Subtract the mean image from every row; sweep() over the columns does this
# in one vectorised step and returns a 100-row numeric matrix.
Z <- sweep(as.matrix(X), 2, mu.X)
# Each call is its own statement (two calls on one line do not parse).
show_digit(Z[1, ])
show_digit(Z[10, ])
show_digit(Z[100, ])
# Covariance between pixels and its eigen decomposition: pca.Z$values holds
# the eigenvalues and pca.Z$vectors the eigenvectors (one per column).
cov.Z <- cov(Z)
dim(cov.Z)
pca.Z <- eigen(cov.Z)
show_digit((255 - X[1, ])) # Full information
# k = 50
# Rebuild the first centred image Z[1, ] from its first k principal
# components, for k = 50, 100, 150 and 200:
#   U.k  - the first k eigenvectors (one per column),
#   Z1.k - coordinates of Z[1, ] in that basis (t(U.k) %*% Z[1, ]),
#   UX.k - the projection mapped back to pixel space (U.k %*% Z1.k).
# The matrix product operator is `%*%`; it must be written without a space
# inside, otherwise R looks for a different, undefined operator.
# NOTE(review): X holds 100 images, so cov.Z has rank at most 99; for
# k >= 100 the reconstruction of this training image should already be
# (numerically) exact, which makes the k = 100/150/200 plots near-identical.
U.50 <- pca.Z$vectors[, 1:50]
Z1.50 <- as.numeric(t(U.50) %*% Z[1, ])
UX.50 <- U.50 %*% as.matrix(Z1.50)
show_digit(255 - (UX.50 + as.matrix(mu.X)))
# k = 100
U.100 <- pca.Z$vectors[, 1:100]
Z1.100 <- as.numeric(t(U.100) %*% Z[1, ])
UX.100 <- U.100 %*% as.matrix(Z1.100)
show_digit(255 - (UX.100 + as.matrix(mu.X)))
# k = 150
U.150 <- pca.Z$vectors[, 1:150]
Z1.150 <- as.numeric(t(U.150) %*% Z[1, ])
UX.150 <- U.150 %*% as.matrix(Z1.150)
show_digit(255 - (UX.150 + as.matrix(mu.X)))
# k = 200
U.200 <- pca.Z$vectors[, 1:200]
Z1.200 <- as.numeric(t(U.200) %*% Z[1, ])
UX.200 <- U.200 %*% as.matrix(Z1.200)
show_digit(255 - (UX.200 + as.matrix(mu.X)))
# Share of the total variance carried by each principal component.
# The x axis is derived from the number of eigenvalues instead of a
# hard-coded 784, so both vectors always have the same length.
plot(
  seq_along(pca.Z$values), pca.Z$values / sum(pca.Z$values),
  type = "o", col = 2, lwd = 2,
  xlab = "dimension", ylab = "variance explained", cex = 0.4
)
# Cumulative share of the total variance over the first k components.
plot(
  seq_along(pca.Z$values), cumsum(pca.Z$values) / sum(pca.Z$values),
  type = "o", col = 2, lwd = 2,
  xlab = "dimension", ylab = "variance explained", cex = 0.4
)
# Draw each of the first eight eigenvectors of cov.Z as an image, in order.
U <- pca.Z$vectors[, 1:8]
for (j in seq_len(ncol(U))) {
  show_digit(255 - U[, j])
}

Code I tried for classifying the digits above

# NOTE(review): `+` on two data frames adds them cell by cell, so each of the
# 100 rows becomes the pixel-wise sum of one "3" image and one "4" image
# instead of 200 separate images; rbind() would stack the two sets.
X<-train [train $y == 3,] [1: 100, -785] + train [train $y == 4,] [1: 100, -785]


Or

# NOTE(review): `||` is the scalar operator and the bare `4` is not a
# comparison, so this is not an element-wise "3 or 4" mask (R >= 4.3 rejects
# a length > 1 operand to `||`); `train$y %in% c(3, 4)` expresses that filter.
X<-train [train $y == 3 || 4,] [1: 100, -785]


I tried both, but they only output images in which 3 and 4 are mixed together, so the unsupervised classification does not work.

Supplemental information (FW/tool version etc.)

I am using the R editor in a Kaggle notebook.