# Origin: recovered from a git diff that added R_Final_Tasks_Statistics.R
# (new file, mode 100755); the diff header and "+" markers were stripped.

## Final R assignment in Intro to Statistics course, fall semester.
#+ Written by Matan Horovitz (207130253) and Guy Amzaleg ()
#+ We have chosen a dataset of the first 600,000 commits to the Linux Kernel
#+ Git repository - as published on Kaggle:
#+ https://www.kaggle.com/datasets/philschmidt/linux-kernel-git-revision-history
#+ This dataset examines commits made to the Linux kernel project over the
#+ last 12 years.

# NOTE(review): the path below is absolute and machine-specific; adjust it to
# wherever the Kaggle CSV lives before running on another machine.
raw_kernel_commits <- read.csv("/home/shmick/linux_kernel_git_revlog.csv")

## BONUS: convert from EPOCH:
#+ as.Date(as.POSIXct(1100171890, origin = "1970-01-01"))

# View() opens a data viewer, which only makes sense in an interactive
# session; guard it so the script can also be run with Rscript.
if (interactive()) {
  View(raw_kernel_commits)
}

## QUESTION 1: we have chosen to examine whether the amount of changes
#+ (additions and deletions) varies over time.
## A larger amount of changes per commit is correlated with a decrease in the
#+ quality of the code - as more changes are harder to track and audit, while
#+ a smaller amount of changes per commit implies stricter coding standards
#+ over time.
## To examine the correlation, we made a subset of the dataset, discarding
#+ data which is irrelevant to our question, and summed all changes from the
#+ same commits into an aggregate - as we are not concerned about the files
#+ changed, only the total amount of changes on each individual commit.

# Keep only the EPOCH timestamp, the additions and the deletions.
kernel_commits <- raw_kernel_commits[
  ,
  c("author_timestamp", "n_additions", "n_deletions")
]

# Rename the resulting dataset into friendlier column names.
colnames(kernel_commits) <- c("EPOCH", "Additions", "Deletions")

# Sum additions and deletions into a new column, named "Total changes".
# `[[` extracts the columns as plain vectors, so this is ordinary vector
# arithmetic rather than data-frame arithmetic.
kernel_commits[["Total changes"]] <-
  kernel_commits[["Additions"]] + kernel_commits[["Deletions"]]

if (interactive()) {
  View(kernel_commits) # examine the resulting dataset
}

# Unite all rows with the same EPOCH to get changes PER COMMIT.
# NOTE(review): this treats rows sharing a timestamp as one commit; two
# distinct commits made in the same second would be merged - confirm that
# this is acceptable for the analysis.
kernel_commits_sum <- aggregate(. ~ EPOCH, data = kernel_commits, FUN = sum)

if (interactive()) {
  View(kernel_commits_sum)
}

# Measure the correlation between EPOCH time (from oldest to newest) and the
# total number of changes per commit. Passing vectors (via `[[`) makes cor()
# return a single number instead of a 1x1 matrix.
cor(kernel_commits_sum[["EPOCH"]], kernel_commits_sum[["Total changes"]])

## QUESTION 2: measure number of columns in our dataset and calculate a
#+ permutation and combination of that number, minus two, and 3.

# Calculate total number of columns in our dataset (this counts the derived
# "Total changes" column as well).
n <- ncol(kernel_commits)
if (interactive()) {
  View(n)
} else {
  print(n)
}

## QUESTION 3: pick two categorical variables - month (?), is documentation