Skip to content
Snippets Groups Projects
Commit eb421831 authored by Gallenkamp's avatar Gallenkamp
Browse files

initial

parents
Branches
No related tags found
No related merge requests found
.Rhistory
.RData
# citavi-export
Export the PDF content associated with each DOI from a Citavi project.
# Attach a package, installing it from CRAN first when it is not available.
#
# x: package name, given unquoted (library2(dplyr)) or as a string.
# Side effects: may install the package; attaches it to the search path.
# Returns the package name invisibly.
library2 <- function(x) {
  # Capture the argument unevaluated so a bare package name can be passed.
  pkg <- toString(substitute(x))
  # requireNamespace() tests availability without the warning that
  # require() emits for a missing package.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cran.us.r-project.org")
  }
  # library() raises an error if the package still cannot be loaded, so a
  # failed installation does not go unnoticed.
  base::library(pkg, character.only = TRUE)
  invisible(pkg)
}
# Automatically install R-libraries
library2(RSQLite)
library2(readtext)
library2(purrr)
library2(stringr)
library2(tibble)
library2(readr)
library2(dplyr)
# Verbosely print tibbles for debugging purposes
options(tibble.print_max = 5, tibble.print_min = 5,tibble.width=8000)
# Get the citavi project file path as user input.
# choose.files() is only called on Windows; elsewhere filepath stays "".
filepath <- ""
if (.Platform$OS.type == "windows") {
  # multi = FALSE: exactly one project file is processed per run.
  filepath <- choose.files(
    default = "",
    caption = "Select the Citavi Project file as data source...",
    multi = FALSE,
    filters = cbind("Citavi Project V6 (*.ctv6)", "*.ctv6")
  )
}
# Proceed only with exactly one non-empty path. An empty selection
# (zero-length result) and the "" left on non-Windows systems are both skipped.
if (length(filepath) == 1 && !is.na(filepath) && nzchar(filepath)) {
  # prepare citavi path for pdfs
  citavi_attachments_dir <- paste0(dirname(filepath), "/", "Citavi Attachments", "/")
  # Open the citavi project file through the SQLite driver
  con <- dbConnect(RSQLite::SQLite(), filepath)
  # Get link between pdf_filepath, wos_doi and doi from tables.
  # dbGetQuery() sends, fetches and clears the result in one step; the
  # connection is closed in `finally` even when the query fails.
  fetched <- tryCatch(
    dbGetQuery(con, "SELECT :citavi_attachments || json_extract(l.Address, '$.UriString') as pdf_filepath, substr( r.Notes,instr( r.Notes,'WOS:')+4) as wos_doi, r.DOI as doi
FROM Reference r LEFT OUTER JOIN Location l ON r.ID = l.ReferenceID and json_extract(l.Address, '$.LinkedResourceType') LIKE 1;", params = list(citavi_attachments = citavi_attachments_dir)),
    finally = dbDisconnect(con)
  )
  print("Fetched the Citavi Attachments and the corresponding doi.")
  # create result dir if it does not exist
  result_dir <- file.path(dirname(filepath), "result")
  if (!dir.exists(result_dir)) {
    dir.create(result_dir)
  }
  # Function meant to retrieve text and store it in result/<id>.txt.
  # The extraction itself is currently commented out, so the function only
  # reports whether a usable PDF path is present.
  #
  # id:        identifier for the output file name (unused while the
  #            extraction is disabled)
  # file_path: path of the PDF attachment; may be NA
  # Returns FALSE when file_path is NA or empty, TRUE otherwise.
  pdf_to_text_file <- function(id, file_path) {
    if (is.na(file_path) || str_length(file_path) <= 0) {
      return(FALSE)
    }
    # read pdf into string
    #readtext_result <- readtext::readtext(file_path, encoding = "UTF-8")
    # save string into file with filepath result/<id>.txt
    #file_name <- paste0(dirname(filepath), "/result/", id, ".txt")
    #readr::write_file(readtext_result$text, file_name)
    TRUE
  }
  # Flag every reference and store the table as result.csv.
  # map2_lgl() matches the logical return value of pdf_to_text_file();
  # the csv still contains TRUE/FALSE in the text column.
  tfetched <- as_tibble(fetched) %>%
    # top_n(50,wos_doi) %>% # testing only!!
    dplyr::mutate(text = purrr::map2_lgl(wos_doi, pdf_filepath, ~pdf_to_text_file(.x, .y))) %>%
    readr::write_csv2(paste0(dirname(filepath), "/result.csv"), append = FALSE, col_names = TRUE)
  print(paste0("Exported extracted text in ", dirname(filepath), ": 'result/<id>.txt' and 'result.csv'."))
} else {
  message("No Citavi project file selected; nothing to export.")
}
\ No newline at end of file
REM Run citavi_export.R with Rscript and keep the console window open afterwards.
REM %~dp0 expands to the directory of this batch file (with trailing backslash),
REM so the R script is found even when the batch file is started from another
REM working directory. Assumes citavi_export.R lies next to this batch file.
SET "cwdrscript=%~dp0"
REM NOTE: the Rscript path is pinned to R 3.5.3; adjust it to the installed version.
"C:\Program Files\R\R-3.5.3\bin\Rscript.exe" "%cwdrscript%citavi_export.R"
pause
\ No newline at end of file
# Attach a package, installing it from CRAN first when it is not available.
#
# x: package name, given unquoted (library2(dplyr)) or as a string.
# Side effects: may install the package; attaches it to the search path.
# Returns the package name invisibly.
library2 <- function(x) {
  # Capture the argument unevaluated so a bare package name can be passed.
  pkg <- toString(substitute(x))
  # requireNamespace() tests availability without the warning that
  # require() emits for a missing package.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cran.us.r-project.org")
  }
  # library() raises an error if the package still cannot be loaded, so a
  # failed installation does not go unnoticed.
  base::library(pkg, character.only = TRUE)
  invisible(pkg)
}
# Automatically install R-libraries
library2(httr)
library2(purrr)
library2(stringr)
library2(tibble)
library2(readr)
library2(dplyr)
library2(jsonlite)
# Get the path of the input table (csv with a DOI column) as user input.
# choose.files() is only called on Windows; elsewhere filepath stays "".
filepath <- ""
if (.Platform$OS.type == "windows") {
  # multi = FALSE: exactly one csv file is processed per run.
  filepath <- choose.files(
    default = "",
    caption = "Select the CSV table with DOI as data source...",
    multi = FALSE,
    filters = cbind("Table with DOI as .csv", "*.csv")
  )
}
# Verbosely print tibbles for debugging purposes
options(tibble.print_max = 5, tibble.print_min = 5,tibble.width=8000)
# Look up the value stored under index 1 of a DOI's handle record via the
# doi.org handle API.
#
# text: logical flag; when TRUE no lookup is made and "" is returned
#       (presumably TRUE marks rows that already have PDF text - confirm
#       against the producer of the input table).
# doi:  DOI string to resolve; NA or "" yields "Invalid DOI".
#
# Returns a single string: the index-1 value on success, "" when text is
# TRUE, or a description of the failure ("Invalid DOI", "Request-Error: ...",
# "HTTP-Error-Code: ...", "API-Error-Code: ...",
# "No handle value with index 1").
get_doi_referer <- function(text, doi) {
  # isTRUE() treats an NA flag like FALSE instead of failing inside `if`.
  if (isTRUE(text)) {
    return("")
  }
  if (is.na(doi) || !nzchar(doi)) {
    return("Invalid DOI")
  }
  # Percent-encode characters that are not allowed in a URL (e.g. spaces);
  # reserved characters such as "/" are left untouched.
  path <- paste0("https://doi.org/api/handles/", utils::URLencode(doi))
  # Pause half a second before every request (presumably to throttle API use).
  Sys.sleep(0.5)
  # A transport failure (no network, timeout) is reported as a string like the
  # other error cases, so one bad request does not abort the whole run.
  response <- tryCatch(GET(url = path), error = function(e) e)
  if (inherits(response, "error")) {
    return(paste0("Request-Error: ", conditionMessage(response)))
  }
  if (status_code(response) != 200) {
    return(paste0("HTTP-Error-Code: ", status_code(response)))
  }
  json_reply <- fromJSON(content(response, as = "text", encoding = "UTF-8"))
  print(json_reply)
  print(json_reply$responseCode)
  if (json_reply$responseCode != 1) {
    return(paste0("API-Error-Code: ", json_reply$responseCode))
  }
  index_one <- dplyr::filter(tibble::as_tibble(json_reply$values), index == 1)
  # Without this guard `[[1]]` on an empty column fails with a subscript error.
  if (nrow(index_one) == 0) {
    return("No handle value with index 1")
  }
  index_one$data$value[[1]]
}
# Read the selected table, look up the DOI referer for every row and store
# the result next to the input file.
# Only run with exactly one non-empty path; a cancelled dialog or a
# non-Windows run would otherwise fail inside read_delim().
if (length(filepath) == 1 && !is.na(filepath) && nzchar(filepath)) {
  # The header row is skipped and replaced by explicit column names so the
  # column types can be fixed (three character columns, one logical).
  tfetched <- readr::read_delim(filepath,
    delim = ";",
    col_names = c("pdf_filepath", "wos_doi", "doi", "text"),
    col_types = "cccl",
    skip = 1
  )
  # write_csv2() returns its input invisibly, so tfetched keeps the new
  # doi_referer column and the print below shows the looked-up values.
  tfetched <- tfetched %>%
    # top_n(5,wos_doi) %>% # testing only!!
    dplyr::mutate(doi_referer = purrr::map2_chr(text, doi, ~get_doi_referer(.x, .y))) %>%
    readr::write_csv2(paste0(dirname(filepath), "/result_with_doi_lookup.csv"), append = FALSE, col_names = TRUE)
  print(tfetched)
} else {
  message("No input CSV file selected; nothing to look up.")
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment