Parse libraries from R project

Having written a larger R project, it may be of interest which packages have been used. As I did not find a ready-to-use package, a colleague of mine - Norman Markgraf - came up with a nice solution. In this post, I build on his solution to provide a function that suits my needs of today.

@Norman: Thanks for your great idea!

First, some libraries:

library(tidyverse)
library(bibtex)
library(testthat)

Then, here is some path of an R project where we want to parse all rmd files:

# Root folder of the project; keep the trailing "/" because file names are
# appended to this path with paste0() further down.
path_project <- "/Users/sebastiansauer/Documents/Publikationen/In_Arbeit/Statistik__21/"

And here a sample rmd file from this folder:

# A single file from the project folder, used to try out the parser.
file_test <- "112_NSE.Rmd"

All the source file names share this pattern in the following order; note the regular expression for each pattern element:

  • three digits \\d\\d\\d
  • underscore _
  • one word \\w+
  • file extension .Rmd

So it should be quite straightforward to match these file names with a regex.

Within the files, we will search for all instances of “library(xxx)”, where “xxx” is the name of one library.

Now, let’s build a function that parses all libraries (as explained above):

parse_libraries <- function(file){
  # Parses all library calls in a source file, i.e., all strings such as
  # library(ggplot2), library("ggplot2") or library(data.table).
  # input:   file - path of the source file to read
  # returns: character vector with the package name of every call found, in
  #          order of appearance (duplicates are kept); character(0) if the
  #          file contains no such call

  # The package name may be quoted and may contain dots (e.g. data.table).
  lib_call <- "library\\(\\s*[\"']?[\\w.]+[\"']?\\s*\\)"
  lines <- readLines(file, warn = FALSE)

  # gregexpr() reports every call on a line, not just the first one.
  calls <- unlist(regmatches(lines, gregexpr(lib_call, lines, perl = TRUE)))

  # Strip the leading "library(" and the trailing ")", together with optional
  # quotes and blanks, so that only the package name remains.
  libs <- gsub("^library\\(\\s*[\"']?|[\"']?\\s*\\)$", "", calls, perl = TRUE)
  return(as.character(libs))
}

Test:


# path_project already ends in "/", so plain concatenation gives the full path.
path_file <- paste0(path_project, file_test)

parse_libraries(path_file)
#> [1] "rlang"     "pryr"      "tidyverse" "gridExtra" "ggplot2"   "ggplot2"  
#> [7] "ggplot2"

Looks good. Now cycle over all files in that folder:

parse_libraries_project <- function(folder, tidy = FALSE){
  # Parses every file in a folder whose name is "<three digits>_<word>.Rmd",
  # e.g. "112_NSE.Rmd", using parse_libraries().
  # input:   folder - path of the folder (with or without a trailing slash)
  #          tidy   - if TRUE, tidy up the result (default: no tidying)
  # returns: tidy = FALSE: list with one character vector of libraries per file
  #          tidy = TRUE:  sorted character vector of the unique libraries,
  #                        without the entry "name_des_pakets"

  # The pattern is anchored and the dot is escaped so that names such as
  # "112_NSE.Rmd.bak" are not picked up. full.names = TRUE lets list.files()
  # build the paths, so `folder` does not need to end in "/".
  files <- list.files(folder,
                      pattern = "^\\d\\d\\d_\\w+\\.Rmd$",
                      full.names = TRUE)

  libs_project <- lapply(files, parse_libraries)

  # some tidying, default is no tidying
  if (tidy) {
    libs_project <- sort(unique(as.character(unlist(libs_project))))

    # "name_des_pakets" is presumably a placeholder used in the text, not a
    # real package. Plain subsetting drops it without attaching the
    # "na.action" attribute that na.omit() would add to the result.
    libs_project <- libs_project[libs_project != "name_des_pakets"]
  }

  return(libs_project)
}

Test it:

# tidy = TRUE: one sorted vector of unique package names instead of a list
# with one entry per file.
libs_in_project <- parse_libraries_project(path_project, tidy = TRUE)
libs_in_project
#>  [1] "AER"           "apaTables"     "BayesFactor"   "BaylorEdPsych"
#>  [5] "boot"          "bootES"        "broom"         "car"          
#>  [9] "caret"         "cluster"       "corrplot"      "corrr"        
#> [13] "cowplot"       "datasets"      "devtools"      "doMC"         
#> [17] "extrafont"     "formattable"   "GGally"        "ggcorrplot"   
#> [21] "ggmap"         "ggmosaic"      "ggplot2"       "ggrepel"      
#> [25] "ggthemes"      "googleVis"     "gplots"        "grid"         
#> [29] "gridExtra"     "huxtable"      "ISOcodes"      "kableExtra"   
#> [33] "knitr"         "komaletter"    "latex2exp"     "leaflet"      
#> [37] "likert"        "lsa"           "lsr"           "magrittr"     
#> [41] "MASS"          "methods"       "mice"          "modelr"       
#> [45] "mosaic"        "mvtnorm"       "nFactors"      "nycflights13" 
#> [49] "okcupiddata"   "pander"        "papaja"        "partykit"     
#> [53] "pdftools"      "plot3D"        "plotROC"       "png"          
#> [57] "prada"         "pradadata"     "pROC"          "pryr"         
#> [61] "psych"         "pwr"           "RColorBrewer"  "rlang"        
#> [65] "rpart"         "rworldmap"     "scales"        "SDMTools"     
#> [69] "sf"            "sjmisc"        "skimr"         "SnowballC"    
#> [73] "stargazer"     "stringr"       "tidyr"         "tidytext"     
#> [77] "tidyverse"     "titanic"       "twitteR"       "VIM"          
#> [81] "viridis"       "wesanderson"   "wordcloud"     "xtable"       
#> [85] "yart"         
#> attr(,"na.action")
#> [1] 47
#> attr(,"class")
#> [1] "omit"
# Sanity checks: a package that is used in the project must be found, and the
# entry "name_des_pakets" must have been removed by the tidying step.
expect_true("tidyverse" %in% libs_in_project)
expect_false("name_des_pakets" %in% libs_in_project)
length(libs_in_project)
#> [1] 85

Software should be cited, so let’s get the citation of each package. Unfortunately, no id (BibTeX key) for the record is rendered. So let’s try to build one for each record. Fortunately, there’s a function for that:

# write.bib() presumably comes from the bibtex package loaded at the top and
# writes one BibTeX record per package.
# NOTE(review): confirm which file the records are written to.
write.bib(libs_in_project)

Finally, let’s create a text that cites all these packages as in-text citations so that the packages will appear in the bibliography. We will use the pandoc citation format, which is “@id”.

format_cite_string <- function(libs){
  # Builds one string of pandoc in-text citations ("@id") from package names.
  # input:   libs - character vector of package names
  # returns: a single string in which every name is prefixed with "@" and
  #          followed by ", " (the last one, too); "" if `libs` is empty
  libs <- unlist(libs)

  # paste0() recycles a zero-length input to "", which would give "@, ".
  if (length(libs) == 0) {
    return("")
  }

  paste0("@", libs, ", ", collapse = "")
}
# Citation string for all packages found in the project.
format_cite_string(libs_in_project)
#> [1] "@AER, @apaTables, @BayesFactor, @BaylorEdPsych, @boot, @bootES, @broom, @car, @caret, @cluster, @corrplot, @corrr, @cowplot, @datasets, @devtools, @doMC, @extrafont, @formattable, @GGally, @ggcorrplot, @ggmap, @ggmosaic, @ggplot2, @ggrepel, @ggthemes, @googleVis, @gplots, @grid, @gridExtra, @huxtable, @ISOcodes, @kableExtra, @knitr, @komaletter, @latex2exp, @leaflet, @likert, @lsa, @lsr, @magrittr, @MASS, @methods, @mice, @modelr, @mosaic, @mvtnorm, @nFactors, @nycflights13, @okcupiddata, @pander, @papaja, @partykit, @pdftools, @plot3D, @plotROC, @png, @prada, @pradadata, @pROC, @pryr, @psych, @pwr, @RColorBrewer, @rlang, @rpart, @rworldmap, @scales, @SDMTools, @sf, @sjmisc, @skimr, @SnowballC, @stargazer, @stringr, @tidyr, @tidytext, @tidyverse, @titanic, @twitteR, @VIM, @viridis, @wesanderson, @wordcloud, @xtable, @yart, "

We (or I, at least) can use this string to cite all the R packages used, something like this: “The following R packages were used in this project: format_cite_string(libs_in_project)”.