## Final R assignment in Intro to Statistics course, fall semester.
## Written by Matan Horovitz (207130253) and Guy Amzaleg ()
## We have chosen a dataset of CPU and GPU performance trends since 2000,
## as published on Kaggle:
## https://www.kaggle.com/datasets/michaelbryantds/cpu-and-gpu-product-data

# Load the raw chip dataset from disk.
chip <- read.csv("/home/shmick/Downloads/chip_dataset.csv")
#chip <- na.omit(chip)

## BONUS: convert from EPOCH:
#as.Date(as.POSIXct(1100171890, origin = "1970-01-01"))
#View(chip)

## For question 1, we examine which type of chip has seen the greater
## improvement over the years - GPU chips or CPU chips.
## As chip performance is most directly correlated with the number of
## transistors, we measure the pace of development by the pace of
## increasing transistor count.

# Split the dataset into CPU-only and GPU-only data frames.
CPU <- chip[chip$Type == "CPU", ]
#CPU <- subset(CPU, select = c(Product, Type, Release.Date, Process.Size..nm.,
#                              TDP..W., Die.Size..mm.2., Transistors..million.,
#                              Freq..MHz.))
GPU <- chip[chip$Type == "GPU", ]
#GPU <- subset(GPU, select = c(Product, Type, Release.Date, Process.Size..nm.,
#                              TDP..W., Die.Size..mm.2., Transistors..million.,
#                              Freq..MHz.))

# Calculate a crude 'performance factor' - the number of transistors
# multiplied by their frequency.
#CPU["Performance Factor"])

# Range of total transistor advancement for each chip type:
# the largest transistor count minus the smallest, ignoring missing values.
diff(range(CPU$Transistors..million., na.rm = TRUE))
diff(range(GPU$Transistors..million., na.rm = TRUE))

# Omit chips with missing data
#CPU <- na.omit(CPU)
#GPU <- na.omit(GPU)

## Iterate over date entries
#for (i in seq_along(CPU$Release.Date)) { print(i) }

## Get date
## Install the 'lubridate' package to deal with conversion to EPOCH time
#install.packages('lubridate')
#library(lubridate)
#dates <- strptime(CPU$Release.Date, format = "%Y-%m-%d")
#as.integer(as.POSIXct(CPU$Release.Date))
#posix_format_date <- c()
#for (date in seq_along(CPU$Release.Date)) {
#  cat("Date is", date)
#  human_format_date <- CPU$Release.Date[date]
#  print(human_format_date)
#  posix_format_date[date] <- strptime(human_format_date, format = "%Y-%m-%d")
#}
#for (i in CPU$Release.Date) {
#  print(i)
#}

## QUESTION 2: measure the number of columns in our dataset and calculate a
## permutation and a combination of that number, minus two, and 3.
# Calculate the total number of columns in our dataset
#n <- ncol(chip)
#View(n)

## QUESTION 3: pick two categorical variables (Chip type, foundry) and see
## whether they're dependent:
## 1. Probability of chip type
## 2. Probability of foundry
## 3. Multiply the two probabilities and compare with the joint probability.
## QUESTION 3 (continued): multiply the two marginal probabilities and compare
## the product with the joint probability to judge independence.

# Sample one value from the 'Type' column.
sampled_type <- sample(chip$Type, 1)
# Empirical probability of the sampled chip type.
p_sampled_type <- length(which(chip$Type == sampled_type)) / length(chip$Type)

# Sample one value from the 'Foundry' column and compute its probability.
sampled_foundry <- sample(chip$Foundry, 1)
p_sampled_foundry <-
  length(which(chip$Foundry == sampled_foundry)) / length(chip$Foundry)

# Conditional probability of the sampled foundry, given the sampled type.
sampled_type_matrix <- chip[chip$Type == sampled_type, ]
p_sampled_foundry_in_sampled_type <-
  length(which(sampled_type_matrix$Foundry == sampled_foundry)) /
  length(sampled_type_matrix$Foundry)

# Joint probability: P(type & foundry) = P(foundry | type) * P(type).
p_sampled_chip_and_foundry <- p_sampled_foundry_in_sampled_type * p_sampled_type

# BUG FIX: the original compared the two floating-point products with '==',
# which is unreliable for doubles; use all.equal() with a tolerance instead.
if (isTRUE(all.equal(p_sampled_chip_and_foundry,
                     p_sampled_type * p_sampled_foundry))) {
  print("Independent")
} else {
  print("Dependent")
}

## Question 4 - 'Amazing': classify each GPU's FP16 GFLOPS as low/medium/high.
GPU <- na.omit(GPU)
fp16_gflops <- na.omit(GPU$FP16.GFLOPS)

# Total range of the FP16 GFLOPS values, rounded to two decimals.
fp16_min <- min(GPU$FP16.GFLOPS, na.rm = TRUE)
fp16_range <- round(max(GPU$FP16.GFLOPS, na.rm = TRUE) - fp16_min, 2)

# BUG FIX: the original thresholds were range/3 and 2*range/3 measured from
# zero, but the raw values start at the minimum, not at zero. The tertile
# boundaries must be offset by the minimum value so each bin really covers a
# third of the observed range.
fp16_low_threshold <- fp16_min + fp16_range / 3
fp16_medium_threshold <- fp16_min + 2 * fp16_range / 3

# Preallocate the result vector instead of growing it inside the loop,
# and use seq_along() rather than 1:length() (safe on empty input).
amazing <- character(length(fp16_gflops))
for (i in seq_along(fp16_gflops)) {
  fp16_gflop <- fp16_gflops[i]
  if (fp16_gflop <= fp16_low_threshold) {
    cat(fp16_gflop, "is low\n")
    amazing[i] <- "low"
  } else if (fp16_gflop <= fp16_medium_threshold) {
    cat(fp16_gflop, "is medium\n")
    amazing[i] <- "medium"
  } else {
    # The three comparisons above are exhaustive for NA-free numeric data,
    # so the original trailing "unknown" branch was unreachable and is gone.
    cat(fp16_gflop, "is high\n")
    amazing[i] <- "high"
  }
}
amazing
GPU["Amazing"] <- amazing

## Question 5: compute the median of the FP16 GFLOPS values by hand.
sorted_fp16_gflops <- sort(fp16_gflops)
fp16_gflops_length <- length(fp16_gflops)
if (fp16_gflops_length %% 2 == 0) {
  print("Dataset is even")
  # Even count: the median is the mean of the two middle elements.
  fp16_gflops_medians <- c(fp16_gflops_length / 2, fp16_gflops_length / 2 + 1)
  fp16_gflops_median <- mean(sorted_fp16_gflops[fp16_gflops_medians])
} else {
  print("Vector is odd")
  # Odd count: the median is the single middle element, at index (n + 1) / 2.
  fp16_gflops_median_index <- (fp16_gflops_length + 1) / 2
  fp16_gflops_median <- sorted_fp16_gflops[fp16_gflops_median_index]
}
cat("Median is:", fp16_gflops_median)
# Sanity check: this should agree with the built-in median(fp16_gflops).

## Question 6: draw three valid FP32 GFLOPS samples and evaluate a normal CDF.
sampled_fp_32_gflops <- numeric(3)
for (i in 1:3) {
  cat("On ", i, "\n")
  sampled_fp_32_gflop <- sample(chip$FP32.GFLOPS, 1)
  # Re-sample while the draw is missing or negative. is.na() is checked first
  # with the short-circuiting '||' so the scalar comparison never sees NA.
  while (is.na(sampled_fp_32_gflop) || sampled_fp_32_gflop < 0) {
    # BUG FIX: the original printed the whole result vector
    # ('sampled_fp_32_gflops', empty on the first pass) instead of the
    # rejected scalar draw, and claimed "negative" even when the draw was NA.
    cat("Sampled value ", sampled_fp_32_gflop, "is negative or NA. Retrying...\n")
    sampled_fp_32_gflop <- sample(chip$FP32.GFLOPS, 1)
  }
  sampled_fp_32_gflops[i] <- sampled_fp_32_gflop
}
# P(X <= draw1) for X ~ Normal(mean = draw2, sd = sqrt(draw3)):
# the third draw is treated as a variance, hence the sqrt().
pnorm(sampled_fp_32_gflops[1],
      mean = sampled_fp_32_gflops[2],
      sd = sqrt(sampled_fp_32_gflops[3]))

## Question 7: z-scores of FP64 GFLOPS plus two transformations of them.
fp64_gflops <- na.omit(GPU$FP64.GFLOPS)
mean(fp64_gflops)
var(fp64_gflops)

# Standard z-score: center by the mean, scale by the standard deviation.
zscore <- (fp64_gflops - mean(fp64_gflops)) / sd(fp64_gflops)
#fp64_gflops_trans <- (fp64_gflops*2 + 16)

# Linear transformation of the z-score: the slope (1/sd) is scaled by 2000.
zscore_lin_trans <- ((1 / sd(fp64_gflops) * 2000) * fp64_gflops) -
  (mean(fp64_gflops) / sd(fp64_gflops))
# ^ THIS is the linear transformation.

# Non-linear transformation: the slope itself depends on the value (x^-0.7).
zscore_non_lin_trans <-
  ((1 / sd(fp64_gflops) * (fp64_gflops)^-0.7) * fp64_gflops) -
  (mean(fp64_gflops) / sd(fp64_gflops))

# BUG FIX: the original 'col = blue' referenced an undefined object and would
# error with "object 'blue' not found"; colour names must be strings.
plot(zscore_lin_trans, zscore_non_lin_trans, col = "blue")
#plot(zscore, zscore_lin_trans)

doubled_zscore <- zscore * 2