Skip to content
Snippets Groups Projects
Commit 83de5bda authored by Gallenkamp, Fabian
Browse files

Update exercise2.R

parent 2793fc7d
No related branches found
No related tags found
No related merge requests found
......@@ -8,6 +8,7 @@ setwd(dirname(rstudioapi::getSourceEditorContext()$path))
source("./exercise1.R")
scrape_regpk_links <- function(remote_driver) {
# select only links containing 'regierungspressekonferenz'
items <- remote_driver$findElements(using = "xpath", "//a[@class='bpa-teaser']") %>%
purrr::map(function(x){x$getElementAttribute('href')}) %>%
purrr::keep(function(x){str_detect(x, "regierungspressekonferenz")})
......@@ -24,22 +25,22 @@ rsdriver <- NULL
out <- tryCatch({
retrieved_items = list()
# create browser instance and set up communication with webdriver (select a chrome version according to your installation, refer also to above troubleshooting to see possible values)
rsdriver <- rsDriver(browser=c("chrome"), chromever="79.0.3945.36", port=8888L, )
# get webdriver
remote_driver <- rsdriver[["client"]]
# goto first search page
remote_driver$navigate("https://www.bundesregierung.de/breg-de/aktuelles/pressekonferenzen")
#randomized_sleep()
# wait some time
randomized_sleep()
# collect relevant search items on current page
new_items <- scrape_regpk_links(remote_driver)
retrieved_items <- retrieved_items %>% append(new_items)
i <- 0
max_pages = 0
while (TRUE) {
max_pages = 2
while (TRUE) { # press next until not possible or max pages reached
link_next_page <- remote_driver$findElement(value = '//li[@class = "bpa-pager-next"]/a')
......@@ -47,7 +48,7 @@ out <- tryCatch({
break
}
link_next_page$clickElement()
# wait some time again
randomized_sleep()
# collect relevant search items on current page
......@@ -59,16 +60,16 @@ out <- tryCatch({
# close browser, rest uses xml2 and rvest from previous exercise
remote_driver$close()
# scrape all regpks with links obtained above
retrieved_regpks = list()
for (item in retrieved_items){
print(item[[1]])
#print(item[[1]])
retrieved_regpks <- retrieved_regpks %>% append(list(scrape_regpk_current(item[[1]])))
}
# write result into json file
retrieved_regpks %>% toJSON( encoding="UTF-8", pretty = TRUE, auto_unbox=TRUE) %>% readr::write_file("./result.json")
},
error=function(cond) {
message("Here's the original error message:")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment