Compare commits

...

9 Commits

SHA1 Message Date
52a300cd77 Add loss function practicve 2024-01-10 15:16:52 +02:00
a27137a2b1 Merde 2 2023-05-26 00:12:11 +03:00
9a0cf35684 Merde 1 2023-03-29 12:01:55 +03:00
193143a79f Why not? 2023-03-06 21:36:23 +02:00
e7f055c670 Done! 2023-03-06 21:35:59 +02:00
d2cc765ed1 Digging into question 3 2023-03-06 15:15:02 +02:00
2d44b377fb Question 1, kind of 2023-02-26 18:25:07 +02:00
6bd14b5476 Second attempt 2023-02-26 17:50:25 +02:00
a40330b3ac First (failed) attempt - R final statistic project (sem A) 2023-02-24 21:55:44 +02:00
6 changed files with 360 additions and 0 deletions

160
R_Final_Tasks_Statistics.R Executable file

@@ -0,0 +1,160 @@
##Final R assignment in Intro to Statistics course, fall semester.
#+Written by Matan Horovitz (207130253) and Guy Amzaleg ()
#+We have chosen a dataset of CPU and GPU performance trends since 2000 - as published on Kaggle:
#+https://www.kaggle.com/datasets/michaelbryantds/cpu-and-gpu-product-data
chip <- read.csv("/home/shmick/Downloads/chip_dataset.csv")
#chip <- na.omit(chip)
##BONUS: convert from EPOCH: as.Date(as.POSIXct(1100171890,origin = "1970-01-01"))
#View(chip)
##For question 1, we have chosen to examine which type of chip has shown the greater improvement over the years - GPU chips or CPU chips.
#+As chip performance is most directly correlated with the number of transistors, we measure the pace of development by the pace of
#+increase in transistor count.
CPU <- chip[chip$Type == 'CPU',]
#CPU <- subset(CPU, select= c(Product,Type,Release.Date,Process.Size..nm.,TDP..W.,Die.Size..mm.2.,Transistors..million.,Freq..MHz.))
GPU <- chip[chip$Type == 'GPU',]
#GPU <- subset(GPU, select= c(Product,Type,Release.Date,Process.Size..nm.,TDP..W.,Die.Size..mm.2.,Transistors..million.,Freq..MHz.))
#Calculate a crude 'performance factor' - the number of transistors multiplied by their frequency.
#CPU["Performance Factor"])
#Range of total transistor advancement
max(CPU$Transistors..million.,na.rm=TRUE) - min(CPU$Transistors..million.,na.rm=TRUE)
max(GPU$Transistors..million.,na.rm=TRUE) - min(GPU$Transistors..million.,na.rm=TRUE)
#Omit chips with missing data
#CPU <- na.omit(CPU)
#GPU <- na.omit(GPU)
##Iterate over date entries
#for (i in 1:length(CPU$Release.Date)){print(i)}
##Get date
##Install the 'lubridate' package to deal with conversion to EPOCH time
#install.packages('lubridate')
#library(lubridate)
#dates <- strptime(CPU$Release.Date,format="%Y-%m-%d")
#as.integer(as.POSIXct(CPU$Release.Date))
#posix_format_date <- c()
#for (date in 1:length(CPU$Release.Date)){
# cat("Date is", date)
# human_format_date <- CPU$Release.Date[date]
# print(human_format_date)
# posix_format_date[date] <- strptime(human_format_date,format="%Y-%m-%d")
#}
#for (i in CPU$Release.Date){
# print(i)
#}
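##A working sketch of the date idea above (a sketch - it assumes Release.Date really is in
#+"%Y-%m-%d" form, as the strptime attempt suggests): convert to numeric dates, then fit a
#+linear model of transistor count over time and compare the two slopes.
CPU$date_num <- as.numeric(as.Date(CPU$Release.Date, format="%Y-%m-%d"))
GPU$date_num <- as.numeric(as.Date(GPU$Release.Date, format="%Y-%m-%d"))
coef(lm(Transistors..million. ~ date_num, data=CPU))["date_num"] #<million transistors gained per day - CPU
coef(lm(Transistors..million. ~ date_num, data=GPU))["date_num"] #<million transistors gained per day - GPU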
##QUESTION 2: measure the number of columns in our dataset and calculate a permutation and a combination
#+of that number minus two, taken 3 at a time.
#Calculate total number of columns in our dataset
#n <- ncol(chip)
#View(n)
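#A sketch of one reading of the question ("minus two, and 3" = choosing 3 out of n-2):
n <- ncol(chip)
choose(n - 2, 3) #<combination: (n-2)! / (3! * ((n-2)-3)!)
choose(n - 2, 3) * factorial(3) #<permutation: (n-2)! / ((n-2)-3)!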
##QUESTION 3: pick two categorical variables (Chip type, foundry) and see whether they're dependent
#+1. Probability of chip type
#+2. Probability of foundry
#+3. Multiply
#Sample 1 variable from 'Type' column
sampled_type <- sample(chip$Type,1)
#Count how many times it appears in its column
p_sampled_type <- (length(which(chip$Type==sampled_type)))/length(chip$Type)
sampled_foundry <- sample(chip$Foundry,1)
p_sampled_foundry <- (length(which(chip$Foundry==sampled_foundry)))/length(chip$Foundry)
sampled_type_matrix <- chip[chip$Type == sampled_type,]
p_sampled_foundry_in_sampled_type <- (length(which(sampled_type_matrix$Foundry==sampled_foundry)))/length(sampled_type_matrix$Foundry)
p_sampled_chip_and_foundry <- p_sampled_foundry_in_sampled_type * p_sampled_type
#Compare with a tolerance - exact == on floating-point probabilities will almost never match
if (isTRUE(all.equal(p_sampled_chip_and_foundry, p_sampled_type * p_sampled_foundry))){
print("Independent")
}else{
print("Dependent")
}
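#The standard way to test whether two categorical variables are independent is a
#+chi-squared test on their full contingency table (a sketch, rather than the single
#+sampled pair above):
chisq.test(table(chip$Type, chip$Foundry))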
#Question 4 - 'Amazing'
GPU <- na.omit(GPU)
fp16_gflops <- na.omit(GPU$FP16.GFLOPS)
#Get total range of FP.16 GFLOPS
fp16_range <- as.numeric(sprintf("%.2f",(max(GPU$FP16.GFLOPS,na.rm=TRUE))-min(GPU$FP16.GFLOPS,na.rm=TRUE)))
fp16_low_threshold <- fp16_range / 3
fp16_medium_threshold <- fp16_low_threshold *2
#Create empty vector named 'amazing'
amazing <-c()
#Iterate over all numbers from 1 to the length of the vector
for (i in 1:length(fp16_gflops))
{
fp16_gflop <- fp16_gflops[i]
#If the value is at or below the low threshold...
if(fp16_gflop <= fp16_low_threshold)
# ^ this bit is important
{
cat(fp16_gflop, "is low\n")
#Add "low" to list called 'Amazing'
amazing[i] <- "low"
#Once this condition is satisfied, move on to next item in loop (if on 1, move on to 2, etc)
next
}
#If the value is above the low threshold AND ALSO (&) at or below the medium threshold...
else if(fp16_gflop > fp16_low_threshold & fp16_gflop <= fp16_medium_threshold )
# ^ this is like two IF's
{
cat(fp16_gflop, "is medium\n")
amazing[i] <- "medium"
next
} else if(fp16_gflop > fp16_medium_threshold) {
cat(fp16_gflop, "is high\n")
amazing[i] <- "high"
next
} else {
cat(fp16_gflop, "is unknown\n")
}
}
amazing
GPU["Amazing"] <- amazing
#Question 5
sorted_fp16_gflops <- sort(fp16_gflops)
fp16_gflops_length <- length(fp16_gflops)
#If the length of the sorted vector is divisible by 2...
if ((fp16_gflops_length %% 2) == 0) {
print("Vector length is even")
#... take the indices of the 2 middle elements...
fp16_gflops_medians <- c((fp16_gflops_length/2),((fp16_gflops_length/2)+1))
#... and calculate their average; that is the median.
fp16_gflops_median <- mean(sorted_fp16_gflops[fp16_gflops_medians])
# ^ These are the 2 middle positions in our even-length vector
} else #< If the length of the sorted vector is odd...
{
print("Vector is odd")
#Get the index of the median number by adding 1 to the total count, and divide by half.
fp16_gflops_median_index <- (((fp16_gflops_length + 1)/2))
#The median is the number in the index we figured out earlier; pull it from the sorted vector.
fp16_gflops_median <- sorted_fp16_gflops[fp16_gflops_median_index]
}
cat("Median is:", fp16_gflops_median)
#Question 6
sampled_fp_32_gflops <- c()
for (i in 1:3){
cat("On ", i, "\n")
sampled_fp_32_gflop <- sample(chip$FP32.GFLOPS,1)
while (is.na(sampled_fp_32_gflop) || sampled_fp_32_gflop < 0)
{
cat("Sampled value", sampled_fp_32_gflop, "is negative or missing. Retrying...\n")
sampled_fp_32_gflop <- sample(chip$FP32.GFLOPS,1)
}
}
sampled_fp_32_gflops[i] <- sampled_fp_32_gflop
}
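#pnorm gives P(X <= q) for a normal distribution with the given mean and sd;
#+here all three parameters come from the values sampled above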
pnorm(sampled_fp_32_gflops[1],mean = sampled_fp_32_gflops[2], sd = sqrt(sampled_fp_32_gflops[3]))
#Question 7
fp64_gflops <- na.omit(GPU$FP64.GFLOPS)
mean(fp64_gflops)
var(fp64_gflops)
zscore <- (fp64_gflops - mean(fp64_gflops)) / sd(fp64_gflops)
#fp64_gflops_trans <- (fp64_gflops*2 + 16)
zscore_lin_trans <- ( ( (1/sd(fp64_gflops) * 2000 ) * fp64_gflops ) - ( mean(fp64_gflops)/sd(fp64_gflops) ) )
# ^ THIS is the linear transformation.
zscore_non_lin_trans <- ( ( (1/sd(fp64_gflops) * (fp64_gflops) ^ -0.7 ) * fp64_gflops ) - ( mean(fp64_gflops)/sd(fp64_gflops) ) )
plot(zscore_lin_trans,zscore_non_lin_trans,col = "blue")
#plot(zscore,zscore_lin_trans)
doubled_zscore <- zscore * 2
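#Z-scores are invariant under positive linear transformations - a quick sketch using the
#+transformation commented out above (2x + 16):
fp64_gflops_trans <- (fp64_gflops*2 + 16)
zscore_trans <- (fp64_gflops_trans - mean(fp64_gflops_trans)) / sd(fp64_gflops_trans)
all.equal(zscore, zscore_trans) #<TRUE - the z-scores are unchanged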

34
amazing.R Normal file

@@ -0,0 +1,34 @@
#Create vector from 1 to 10
num <- 1:10
#Create empty vector named 'amazing'
amazing <-c()
#Iterate over all numbers from 1 to the length of the vector (10)
for (i in 1:length(num))
{
#If the number is greater or equal to 1 AND ALSO (&) smaller or equal to 3...
if(i >= 1 & i <=3)
# ^ this bit is important
{
cat(i, "is low\n")
#Add "low" to list called 'Amazing'
amazing[i] <- "low"
#Once this condition is satisfied, move on to next item in loop (if on 1, move on to 2, etc)
next
}
#If the number is greater than 3 AND ALSO (&) smaller or equal to 6...
else if(i > 3 & i <=6)
# ^ this is like two IF's
{
cat(i, "is medium\n")
amazing[i] <- "medium"
next
}
else if(i > 6 & i <=10)
# ^ this bit is also important
{
cat(i, "is high\n")
amazing[i] <- "high"
next
}
}
amazing
sleep["Amazing"] <- amazing

86
loss.R Normal file

@@ -0,0 +1,86 @@
# Install datawizard package (for calculating mode with 'distribution_mode()')
if (!require(datawizard)) {
install.packages("datawizard")
library(datawizard)
}
# Define the Salaries vector
Salaries <- c(12567, 15400, 11345, 13130, 12567, 12812, 14908)
# Calculate the mode
modeS <- distribution_mode(Salaries)
modeS
# Calculate the median
medianS <- median(Salaries)
medianS
# Calculate the mean
meanS <- mean(Salaries)
meanS
# The first loss function ---------------------
## For each value in vector "Salaries", check whether it is:
#+ 1. different from the mode
#+ 2. different from the median
#+ 3. different from the mean
#+ ...And sum up the results (the "True" results)
# How many values are different from the mode?
# (how many people make something other than the mode?)
sum(Salaries != modeS)
# How many people make something other than the median?
sum(Salaries != medianS)
# Etc
sum(Salaries != meanS)
# Second loss function ----------------------------
sum( abs (Salaries - modeS))
# ^ summarize the ^ absolute value of ^ each salary minus the modal salary
sum(abs(Salaries - medianS))
sum(abs(Salaries - meanS))
# Third loss function ----------------------------
sum((Salaries - modeS)^2)
sum((Salaries - medianS)^2)
sum((Salaries - meanS)^2)
#..........................................Exercise 2
#Create a vector of numbers
potato <- runif(n=20, min=1, max=20)
#Calculate the mean, median and mode of the vector of numbers you created
potato_mean <- mean(potato)
potato_mean
potato_median <- median(potato)
potato_median
potato_mode <- distribution_mode(potato)
potato_mode
# print results
sprintf("Mean %s Median %s Mode %s", potato_mean, potato_median, potato_mode)
#Calculate the error for each loss function for each of the
#measures you calculated (the mean, median and mode)
# 1st loss function
sum(potato != potato_mode)
# 2nd loss function
sum(abs(potato - potato_median))
# 3rd loss function
sum((potato - potato_mean)^2)
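# The exercise asks for every loss function against every measure; a compact sketch
# of the full 3x3 grid using the same three loss functions as above:
measures <- c(mode = potato_mode, median = potato_median, mean = potato_mean)
sapply(measures, function(m) c(
  zero_one = sum(potato != m),
  absolute = sum(abs(potato - m)),
  squared  = sum((potato - m)^2)))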

23
mean.R Normal file

@@ -0,0 +1,23 @@
#Create two vectors with random, normal distribution numbers: one with even, and another with odd number of elements.
bunch_of_nums_even <- rnorm(100)
bunch_of_nums_odd <- rnorm(101)
#Sort one of the vectors according to number size
sorted_nums <- sort(bunch_of_nums_even) #< I used even for this example - switch to odd to test that as well.
num_length <- length(sorted_nums)
#If the length of the sorted vector is divisible by 2...
if ((num_length %% 2) == 0) {
print("Vector is even")
#... create a vector of the 2 middle elements...
num_medians <- c((num_length/2),((num_length/2)+1))
#... and calculate their average; that is the median.
num_median <- mean(sorted_nums[num_medians])
# ^ This is a vector of the 2 middle spots in our even vector
} else #< If the length of the sorted vector is odd...
{
print("Vector is odd")
#Get the index of the median by adding 1 to the total count and dividing by two.
num_median_index <- (((num_length + 1)/2)) #<There's probably a better way to do this.
#The median is the number in the index we figured out earlier; pull it from the sorted vector.
num_median <- sorted_nums[num_median_index]
}
cat("Median is:", num_median)

26
merde_1.R Normal file

@@ -0,0 +1,26 @@
#Question 4
mean_1 = 50
n_1 = 36
std_1 = 42
#a_1 = 0.05 NOPE - alpha itself is not what multiplies the standard error
z_1 = 1.96 #<the z critical value for 95% confidence (alpha = 0.05, two-tailed)
lower_1 = mean_1 - (std_1/sqrt(n_1))*z_1
upper_1 = mean_1 + (std_1/sqrt(n_1))*z_1
cat(lower_1,"-",mean_1,"-",upper_1)
#Question 5
## Higher confidence -> wider interval -> less precise estimate -> greater range
## ~~Bigger n -> greater range -> greater standard deviation -> ?~~ NOPE
## Bigger n -> smaller standard error (std_1/sqrt(n_1)) -> smaller confidence range
#Question 6
std_2 = 15
e_2 = 10 #<the required margin of error
# 10 = 15/sqrt(n) -> sqrt(n) = 15/10 -> n_2 = 1.5 #nope
#1.96*stderr_2 = e_2 (10)
#stderr_2 = 5.102041
#n_2: 5.102041 = 15/sqrt(n)
#15/5.102041 = sqrt(n)
# 10 = 1.96*(15/sqrt(n))
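#Finishing the algebra from the line above: sqrt(n) = 1.96*15/10 = 2.94, so n = 2.94^2
n_2 = (1.96 * std_2 / e_2)^2 #<8.6436 - round up to n = 9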

31
merde_2.R Normal file

@@ -0,0 +1,31 @@
n=36
x_init=0.46 #<proportion - p - parameter
x_finit=0.38
#---Step 1 - define the hypothesis
#H0: p = 0.46
#H1: p!= 0.46
#---Step 2 - define the assumptions and distribution shape
#1. Assume normal distribution
#2. Assume random, unbiased sampling
#3. p`= p = 0.46 - H0 is assumed true
#4. stderr of p` = sqrt((q*p)/n)
#---Step 3 - determine where a (alpha) is, define criterion
a=0.02 #0.04/2 Assumption: H1 (double tail)
Zc=2.05 #<given
#stderr=sqrt(((0.46-0.38)^2)/35) #<what is this misery?
#^this is nonsense
#---Step 4 - compute the test statistic: Zp` = (p` - p)/stderr, using the stderr from step 2
stderr=sqrt((x_init*(1-x_init))/n) #<sqrt((p*q)/n) = sqrt((0.46*0.54)/36), about 0.083
Zptag=(x_finit-x_init)/stderr #<about -0.96
#---Step 5 - decide: |Zptag| = 0.96 < Zc = 2.05, so H0 is not rejected
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
a=0.05
x=5
s=1.5
n=15
s2 <- s
#Is s` the same as s in this case?
#s` pushes upwards - this is a right (upper) tail - possibly different result! (?)
#What on earth is a t test?
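#In short: a t test is a z test for when the population sd is unknown and the sample sd
#is used instead; the statistic follows a t distribution with n-1 degrees of freedom.
#A sketch with the numbers above - NOTE: the H0 mean is not given in the question, so the
#mu_0 here is a made-up placeholder, for illustration only:
mu_0 = 4 #<hypothetical H0 mean
t_stat = (x - mu_0)/(s/sqrt(n)) #<(x bar - mu_0)/(s/sqrt(n)), about 2.58
t_crit = qt(1 - a, df = n - 1) #<upper-tail critical value at alpha = 0.05, about 1.76
t_stat > t_crit #<TRUE -> H0 would be rejected in an upper-tail test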