160 lines
6.5 KiB
R
Executable File
160 lines
6.5 KiB
R
Executable File
##Final R assignment in Intro to Statistics course, fall semester.
|
|
#+Written by Matan Horovitz (207130253) and Guy Amzaleg ()
|
|
#+We have chosen a dataset of CPU and GPU performance trends since 2000 - as published on Kaggle:
|
|
#+https://www.kaggle.com/datasets/michaelbryantds/cpu-and-gpu-product-data
|
|
|
|
# Load the chip dataset.
# The CSV location defaults to the original author's download path and can be
# overridden with the CHIP_DATASET_CSV environment variable, so the script can
# run on another machine without editing the source.
chip_csv_path <- Sys.getenv(
  "CHIP_DATASET_CSV",
  unset = "/home/shmick/Downloads/chip_dataset.csv"
)
# Fail early with a readable message instead of read.csv()'s connection error.
if (!file.exists(chip_csv_path)) {
  stop("Chip dataset not found at: ", chip_csv_path, call. = FALSE)
}
chip <- read.csv(chip_csv_path)
|
|
#chip <- na.omit(chip)
|
|
##BONUS: convert from EPOCH: as.Date(as.POSIXct(1100171890,origin = "1970-01-01"))
|
|
#View(chip)
|
|
##For question 1, we have chosen to examine which type of chip has seen the greater improvement over the years - GPU chips or CPU chips.
|
|
#+As chip performance is most directly correlated with the number of transistors, we have measured the pace of development based on pace of
|
|
#+increasing transistor count.
|
|
# Split the dataset into one data frame per chip type.
GPU <- chip[chip$Type == "GPU", ]
CPU <- chip[chip$Type == "CPU", ]
# Optional column selection, kept disabled as in the original script:
#CPU <- subset(CPU, select= c(Product,Type,Release.Date,Process.Size..nm.,TDP..W.,Die.Size..mm.2.,Transistors..million.,Freq..MHz.))
#GPU <- subset(GPU, select= c(Product,Type,Release.Date,Process.Size..nm.,TDP..W.,Die.Size..mm.2.,Transistors..million.,Freq..MHz.))

# Idea (not implemented): a crude 'performance factor' - the number of
# transistors multiplied by their frequency.

# Spread of the transistor count (largest minus smallest, in millions),
# ignoring missing values - first for CPUs, then for GPUs.
diff(range(CPU$Transistors..million., na.rm = TRUE))
diff(range(GPU$Transistors..million., na.rm = TRUE))
|
|
#Omit chips with missing data
|
|
#CPU <- na.omit(CPU)
|
|
#GPU <- na.omit(GPU)
|
|
##Iterate over date entries
|
|
#for (i in 1:length(CPU$Release.Date)){print(i)}
|
|
##Get date
|
|
##Install the 'lubridate' package to deal with conversion to EPOCH time
|
|
#install.packages('lubridate')
|
|
#library(lubridate)
|
|
#dates <- strptime(CPU$Release.Date,format="%Y-%m-%d")
|
|
#as.integer(as.POSIXct(CPU$Release.Date))
|
|
#posix_format_date <- c()
|
|
#or (date in 1:length(CPU$Release.Date)){
|
|
# cat("Date is", date)
|
|
# human_format_date <- CPU$Release.Date[date]
|
|
# print(human_format_date)
|
|
# posix_format_date[date] <- strptime(human_format_date,format="%Y-%m-%d")
|
|
#}
|
|
#for (i in CPU$Release.Date){
|
|
# print(i)
|
|
#}
|
|
|
|
|
|
##QUESTION 2: measure number of columns in our dataset and calculate a permutation and combination of
|
|
#+that number, minus two, and 3.
|
|
|
|
#Calculate total number of columns in our dataset
|
|
#n <- ncol(kernel_commits)
|
|
#View(n)
|
|
|
|
##QUESTION 3: pick two categorical variables (Chip type, foundry) and see whether they're dependent
|
|
#+1. Probability of chip type
|
|
#+2. Probability of foundry
|
|
#+3. Multiply
|
|
|
|
# Draw one chip type and one foundry at random and check whether the two
# events look independent, i.e. whether
#   P(type and foundry) == P(type) * P(foundry).

# Sample one value from the 'Type' column.
sampled_type <- sample(chip$Type, 1)
# Relative frequency of that type in its column (NA comparisons count as
# misses, the denominator is the full column length).
p_sampled_type <- sum(chip$Type == sampled_type, na.rm = TRUE) /
  length(chip$Type)

# Same for one value sampled from the 'Foundry' column.
sampled_foundry <- sample(chip$Foundry, 1)
p_sampled_foundry <- sum(chip$Foundry == sampled_foundry, na.rm = TRUE) /
  length(chip$Foundry)

# Rows of the sampled type only. which() is used so that missing 'Type'
# entries are dropped instead of being turned into all-NA rows.
sampled_type_matrix <- chip[which(chip$Type == sampled_type), ]
# Conditional probability P(foundry | type).
p_sampled_foundry_in_sampled_type <-
  sum(sampled_type_matrix$Foundry == sampled_foundry, na.rm = TRUE) /
  length(sampled_type_matrix$Foundry)

# Joint probability P(type and foundry) = P(foundry | type) * P(type).
p_sampled_chip_and_foundry <- p_sampled_foundry_in_sampled_type * p_sampled_type

# Both sides are floating-point results of divisions and products, so they are
# compared with a numeric tolerance rather than with exact '=='.
if (isTRUE(all.equal(
  p_sampled_chip_and_foundry,
  p_sampled_type * p_sampled_foundry
))) {
  print("Independent")
} else {
  print("Dependent")
}
|
|
|
|
# Question 4 - 'Amazing'
# Label every GPU as "low", "medium" or "high" depending on which third of the
# observed FP16 GFLOPS range its value falls into, and store the labels in a
# new 'Amazing' column.

# Keep only GPU rows that have no missing value in any column.
GPU <- na.omit(GPU)
fp16_gflops <- na.omit(GPU$FP16.GFLOPS)
if (length(fp16_gflops) == 0) {
  stop("No complete GPU rows left; cannot classify FP16 GFLOPS.", call. = FALSE)
}

# Get total range of FP16 GFLOPS (rounded to two decimals).
fp16_min <- min(fp16_gflops)
fp16_max <- max(fp16_gflops)
fp16_range <- round(fp16_max - fp16_min, 2)
# The cut points split [min, max] into three equal-width bins, so they must be
# offset by the minimum; range / 3 alone is only correct when the minimum is 0.
fp16_low_threshold <- fp16_min + fp16_range / 3
fp16_medium_threshold <- fp16_min + 2 * fp16_range / 3

# Classify all values at once:
#   value <= low threshold                -> "low"
#   low threshold < value <= medium       -> "medium"
#   value > medium threshold              -> "high"
# Start from "high" and overwrite downwards so every element gets a label.
amazing <- rep("high", length(fp16_gflops))
amazing[fp16_gflops <= fp16_medium_threshold] <- "medium"
amazing[fp16_gflops <= fp16_low_threshold] <- "low"

# Report the label of every value, one line per chip.
for (i in seq_along(fp16_gflops)) {
  cat(fp16_gflops[i], " is ", amazing[i], "\n", sep = "")
}

amazing
GPU["Amazing"] <- amazing
|
|
# Question 5
# Compute the median of the FP16 GFLOPS values by hand: sort them, then take
# the middle element (odd count) or the average of the two middle elements
# (even count).
fp16_gflops_length <- length(fp16_gflops)
sorted_fp16_gflops <- sort(fp16_gflops)

if ((fp16_gflops_length %% 2) != 0) {
  # Odd count: a single element sits exactly in the middle.
  print("Vector is odd")
  # Its position is (count + 1) / 2.
  fp16_gflops_median_index <- (fp16_gflops_length + 1) / 2
  fp16_gflops_median <- sorted_fp16_gflops[fp16_gflops_median_index]
} else {
  # Even count: two elements straddle the middle.
  print("Dataset is even")
  # Positions count / 2 and count / 2 + 1 ...
  fp16_gflops_medians <- fp16_gflops_length / 2 + c(0, 1)
  # ... and the median is the average of the values found there.
  fp16_gflops_median <- mean(sorted_fp16_gflops[fp16_gflops_medians])
}

cat("Median is:", fp16_gflops_median)
|
|
# Question 6
# Draw three non-negative, non-missing FP32 GFLOPS values at random and use
# them, in order, as x, mean and variance of a normal distribution; then
# evaluate P(X <= x) for X ~ N(mean, variance).

# Without at least one usable value the retry loop below would never end.
if (!any(chip$FP32.GFLOPS >= 0, na.rm = TRUE)) {
  stop("No non-negative FP32 GFLOPS values to sample from.", call. = FALSE)
}

sampled_fp_32_gflops <- numeric(3)
for (i in seq_len(3)) {
  cat("On ", i, "\n")
  sampled_fp_32_gflop <- sample(chip$FP32.GFLOPS, 1)
  # Reject missing or negative draws. is.na() is tested first so the
  # comparison never has to evaluate a missing value.
  while (is.na(sampled_fp_32_gflop) || sampled_fp_32_gflop < 0) {
    # Report the rejected draw itself (not the vector of accepted values).
    cat(
      "Sampled value ", sampled_fp_32_gflop,
      "is negative or missing. Retrying...\n"
    )
    sampled_fp_32_gflop <- sample(chip$FP32.GFLOPS, 1)
  }
  sampled_fp_32_gflops[i] <- sampled_fp_32_gflop
}

# The third value is treated as a variance, hence the square root for 'sd'.
pnorm(
  sampled_fp_32_gflops[1],
  mean = sampled_fp_32_gflops[2],
  sd = sqrt(sampled_fp_32_gflops[3])
)
|
|
|
|
# Question 7
# Standardise the FP64 GFLOPS values of the GPUs and compare a linear with a
# non-linear transformation of them.
fp64_gflops <- na.omit(GPU$FP64.GFLOPS)
fp64_mean <- mean(fp64_gflops)
fp64_sd <- sd(fp64_gflops)

# Show the mean and the variance of the values.
mean(fp64_gflops)
var(fp64_gflops)

# Z-score: distance from the mean, measured in standard deviations.
zscore <- (fp64_gflops - fp64_mean) / fp64_sd

#fp64_gflops_trans <- (fp64_gflops*2 + 16)

# Linear transformation: the z-score formula written as a * x + b, with the
# slope additionally scaled by the constant 2000.
zscore_lin_trans <- (1 / fp64_sd * 2000) * fp64_gflops - fp64_mean / fp64_sd

# Non-linear transformation: the slope itself depends on x (x ^ -0.7).
zscore_non_lin_trans <-
  (1 / fp64_sd * fp64_gflops^(-0.7)) * fp64_gflops - fp64_mean / fp64_sd

# 'col' takes the colour name as a string; a bare blue is an undefined object.
plot(zscore_lin_trans, zscore_non_lin_trans, col = "blue")
#plot(zscore,zscore_lin_trans)

doubled_zscore <- zscore * 2