Skip to content
Snippets Groups Projects
Commit eb421831 authored by Gallenkamp's avatar Gallenkamp
Browse files

initial

parents
No related branches found
No related tags found
No related merge requests found
.Rhistory
.RData
# citavi-export
Export the PDF content associated with each DOI from a Citavi project.
# Attach a package, installing it first when it cannot be loaded.
#
# x: the package name, written either bare (library2(dplyr)) or as a string.
#    The argument is captured with substitute() and never evaluated.
# Returns (invisibly) NULL when the package was already available, otherwise
# the value of base::library() after installation.
library2 <- function(x) {
  pkg_name <- toString(substitute(x))
  is_attached <- require(pkg_name, character.only = TRUE)
  if (!is_attached) {
    install.packages(pkg_name, repos = "http://cran.us.r-project.org")
    base::library(pkg_name, character.only = TRUE)
  }
}
# Install (when missing) and attach the R libraries this script relies on
library2(RSQLite)
library2(readtext)
library2(purrr)
library2(stringr)
library2(tibble)
library2(readr)
library2(dplyr)
# Verbosely print tibbles for debugging purposes
options(tibble.print_max = 5, tibble.print_min = 5, tibble.width = 8000)
# Get the Citavi project file path as user input.
# choose.files() only exists on Windows, so on other platforms the empty
# placeholder is left in place.
filepath <- ""
if (.Platform$OS.type == "windows") {
  filepath <- choose.files(
    default = "",
    caption = "Select the Citavi Project file as data source...",
    # The rest of the script treats `filepath` as exactly one path
    # (dirname(), dbConnect()), so do not allow a multi-selection.
    multi = FALSE,
    filters = cbind("Citavi Project V6 (*.ctv6)", "*.ctv6")
  )
}
# Only continue when exactly one non-empty project path is available: a
# cancelled file dialog yields character(0), and the "" placeholder means no
# dialog could be shown at all.
if (length(filepath) == 1 && nzchar(filepath)) {
  # Directory in which Citavi keeps the attached PDFs of this project
  citavi_attachments_dir <- paste0(dirname(filepath), "/", "Citavi Attachments", "/")
  # Open the Citavi project file itself as an SQLite database
  con <- dbConnect(RSQLite::SQLite(), filepath)
  # Link pdf_filepath, wos_doi and doi. The attachment directory is passed as
  # a bound parameter and prepended to the stored relative URI inside SQL.
  attachment_query <- paste(
    "SELECT :citavi_attachments || json_extract(l.Address, '$.UriString') as pdf_filepath,",
    "substr( r.Notes,instr( r.Notes,'WOS:')+4) as wos_doi, r.DOI as doi",
    "FROM Reference r LEFT OUTER JOIN Location l ON r.ID = l.ReferenceID",
    "and json_extract(l.Address, '$.LinkedResourceType') LIKE 1;"
  )
  # dbGetQuery() sends, fetches and clears the result in one step; `finally`
  # guarantees the connection is closed even when the query fails.
  fetched <- tryCatch(
    dbGetQuery(
      con,
      attachment_query,
      params = list(citavi_attachments = citavi_attachments_dir)
    ),
    finally = dbDisconnect(con)
  )
  print("Fetched the Citavi Attachments and the corresponding doi.")
  # Create the result directory if it does not exist yet
  result_dir <- file.path(dirname(filepath), "result")
  if (!dir.exists(result_dir)) {
    dir.create(result_dir)
  }
  # Report whether a reference has a usable attachment path.
  #
  # id:        identifier intended as the output file name (result/<id>.txt)
  # file_path: path to the attached PDF; NA when the reference has none
  # Returns TRUE when file_path is a non-empty string, FALSE otherwise.
  # NOTE(review): the actual text extraction below is commented out, so no
  # result/<id>.txt files are written at the moment.
  pdf_to_text_file <- function(id, file_path) {
    if (is.na(file_path) || str_length(file_path) <= 0) {
      return(FALSE)
    }
    # read pdf into string
    #readtext_result <- readtext::readtext(file_path, encoding = "UTF-8")
    # save string into file with filepath result/<id>.txt
    #file_name <- paste0(dirname(filepath), "/result/", id, ".txt")
    #readr::write_file(readtext_result$text, file_name)
    TRUE
  }
  # Flag every reference and store the table as result.csv.
  # map2_lgl() matches the logical return value of pdf_to_text_file(); the CSV
  # content (TRUE/FALSE) is the same as with the former character coercion.
  tfetched <- as_tibble(fetched) %>%
    # top_n(50,wos_doi) %>% # testing only!!
    dplyr::mutate(text = purrr::map2_lgl(wos_doi, pdf_filepath, ~pdf_to_text_file(.x, .y))) %>%
    readr::write_csv2(paste0(dirname(filepath), "/result.csv"), append = FALSE, col_names = TRUE)
  print(paste0("Exported extracted text in ", dirname(filepath), ": 'result/<id>.txt' and 'result.csv'."))
} else {
  message("No Citavi project file selected; nothing to export.")
}
\ No newline at end of file
REM Launch citavi_export.R with Rscript.
REM %~dp0 expands to the drive and folder of this batch file (with a trailing
REM backslash), so the R script is found even when the batch file is started
REM from a shortcut, a UNC path or another working directory.
REM Expects citavi_export.R to sit next to this batch file.
SET "cwdrscript=%~dp0"
REM NOTE(review): the Rscript path is pinned to R 3.5.3 - adjust it when a
REM different R version is installed.
"C:\Program Files\R\R-3.5.3\bin\Rscript.exe" "%cwdrscript%citavi_export.R"
pause
\ No newline at end of file
# Load a required package; when loading fails, install it and attach it.
#
# x: package name, either unquoted or quoted. It is turned into a string via
#    substitute(), so the symbol itself is never evaluated.
# Returns (invisibly) NULL if the package could be loaded right away,
# otherwise whatever base::library() returns after the installation.
library2 <- function(x) {
  package <- toString(substitute(x))
  loaded <- require(package, character.only = TRUE)
  if (!loaded) {
    install.packages(package, repos = "http://cran.us.r-project.org")
    base::library(package, character.only = TRUE)
  }
}
# Install (when missing) and attach the R libraries this script relies on
library2(httr)
library2(purrr)
library2(stringr)
library2(tibble)
library2(readr)
library2(dplyr)
library2(jsonlite)
# Get the path of the semicolon-separated .csv table with the DOIs as user
# input. choose.files() only exists on Windows, so on other platforms the
# empty placeholder is left in place.
# NOTE(review): the dialog caption still mentions the Citavi project file
# although the filter expects a .csv table.
filepath <- ""
if (.Platform$OS.type == "windows") {
  filepath <- choose.files(
    default = "",
    caption = "Select the Citavi Project file as data source...",
    # The rest of the script reads exactly one file, so do not allow a
    # multi-selection.
    multi = FALSE,
    filters = cbind("Table with DOI as .csv", "*.csv")
  )
}
# Verbosely print tibbles for debugging purposes
options(tibble.print_max = 5, tibble.print_min = 5, tibble.width = 8000)
# Look up the value stored in the index-1 record of a DOI handle via the
# doi.org handle API.
#
# text: logical flag from the input table; when TRUE no lookup is made.
#       (Presumably the "text available" flag written by the export script -
#       verify against the producer of the CSV.) NA is treated like FALSE.
# doi:  character scalar holding the DOI; may be NA or empty.
#
# Returns a character scalar:
#   ""                      when text is TRUE
#   "Invalid DOI"           when doi is NA or empty
#   "Request-Error: ..."    when the HTTP request itself fails
#   "HTTP-Error-Code: ..."  when the HTTP status is not 200
#   "API-Error-Code: ..."   when the API responseCode is not 1
#   "No index 1 value"      when the handle has no record with index 1
#   otherwise the `data$value` entry of the record with index 1.
get_doi_referer <- function(text, doi) {
  # isTRUE() instead of `!text`: an NA flag would make `if` fail.
  if (isTRUE(text)) {
    return("")
  }
  if (is.na(doi) || !nzchar(doi)) {
    return("Invalid DOI")
  }
  path <- paste0("https://doi.org/api/handles/", doi)
  # Pause before every request to keep the request rate low.
  Sys.sleep(0.5)
  # A network failure (timeout, DNS, ...) must not abort the whole batch, so
  # it is reported as a value just like the HTTP and API errors below.
  response <- tryCatch(GET(url = path), error = function(e) e)
  if (inherits(response, "error")) {
    return(paste0("Request-Error: ", conditionMessage(response)))
  }
  if (status_code(response) != 200) {
    return(paste0("HTTP-Error-Code: ", status_code(response)))
  }
  json_reply <- fromJSON(content(response, as = "text", encoding = "UTF-8"))
  print(json_reply)
  print(json_reply$responseCode)
  if (json_reply$responseCode != 1) {
    return(paste0("API-Error-Code: ", json_reply$responseCode))
  }
  index_one <- dplyr::filter(as_tibble(json_reply$values), index == 1)
  # Guard against handles without an index-1 record; `[[1]]` on an empty
  # column would otherwise raise "subscript out of bounds".
  if (nrow(index_one) == 0) {
    return("No index 1 value")
  }
  index_one$data$value[[1]]
}
# Stop with a clear message when no input table is available: a cancelled
# file dialog yields character(0), and the "" placeholder means no dialog
# could be shown at all.
if (length(filepath) != 1 || !nzchar(filepath)) {
  stop("No input .csv file selected.", call. = FALSE)
}
# Read the semicolon-separated table; the header row is skipped and replaced
# by fixed column names, and `text` is parsed as logical.
tfetched <- readr::read_delim(
  filepath,
  delim = ";",
  col_names = c("pdf_filepath", "wos_doi", "doi", "text"),
  col_types = "cccl",
  skip = 1
)
# Add the DOI lookup result and write the extended table next to the input.
# write_csv2() returns its input invisibly, so assigning the pipeline result
# makes the print() below show the table including `doi_referer`.
tfetched <- tfetched %>%
  # top_n(5,wos_doi) %>% # testing only!!
  dplyr::mutate(doi_referer = purrr::map2_chr(text, doi, ~get_doi_referer(.x, .y))) %>%
  readr::write_csv2(paste0(dirname(filepath), "/result_with_doi_lookup.csv"), append = FALSE, col_names = TRUE)
print(tfetched)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment