Skip to content
Snippets Groups Projects
Commit 83de5bda authored by Gallenkamp, Fabian's avatar Gallenkamp, Fabian
Browse files

Update exercise2.R

parent 2793fc7d
Branches
Tags
No related merge requests found
...@@ -8,6 +8,7 @@ setwd(dirname(rstudioapi::getSourceEditorContext()$path)) ...@@ -8,6 +8,7 @@ setwd(dirname(rstudioapi::getSourceEditorContext()$path))
source("./exercise1.R") source("./exercise1.R")
scrape_regpk_links <- function(remote_driver) { scrape_regpk_links <- function(remote_driver) {
# select only links containing 'regierungspressekonferenz'
items <- remote_driver$findElements(using = "xpath", "//a[@class='bpa-teaser']") %>% items <- remote_driver$findElements(using = "xpath", "//a[@class='bpa-teaser']") %>%
purrr::map(function(x){x$getElementAttribute('href')}) %>% purrr::map(function(x){x$getElementAttribute('href')}) %>%
purrr::keep(function(x){str_detect(x, "regierungspressekonferenz")}) purrr::keep(function(x){str_detect(x, "regierungspressekonferenz")})
...@@ -24,22 +25,22 @@ rsdriver <- NULL ...@@ -24,22 +25,22 @@ rsdriver <- NULL
out <- tryCatch({ out <- tryCatch({
retrieved_items = list() retrieved_items = list()
# create a browser instance and set up communication with the webdriver (select a chrome version matching your installation; see also the troubleshooting notes above for possible values)
rsdriver <- rsDriver(browser=c("chrome"), chromever="79.0.3945.36", port=8888L, ) rsdriver <- rsDriver(browser=c("chrome"), chromever="79.0.3945.36", port=8888L, )
# get webdriver
remote_driver <- rsdriver[["client"]] remote_driver <- rsdriver[["client"]]
# go to the first search page
remote_driver$navigate("https://www.bundesregierung.de/breg-de/aktuelles/pressekonferenzen") remote_driver$navigate("https://www.bundesregierung.de/breg-de/aktuelles/pressekonferenzen")
# wait some time
#randomized_sleep() randomized_sleep()
# collect relevant search items on current page # collect relevant search items on current page
new_items <- scrape_regpk_links(remote_driver) new_items <- scrape_regpk_links(remote_driver)
retrieved_items <- retrieved_items %>% append(new_items) retrieved_items <- retrieved_items %>% append(new_items)
i <- 0 i <- 0
max_pages = 0 max_pages = 2
while (TRUE) { while (TRUE) { # press next until not possible or max pages reached
link_next_page <- remote_driver$findElement(value = '//li[@class = "bpa-pager-next"]/a') link_next_page <- remote_driver$findElement(value = '//li[@class = "bpa-pager-next"]/a')
...@@ -47,7 +48,7 @@ out <- tryCatch({ ...@@ -47,7 +48,7 @@ out <- tryCatch({
break break
} }
link_next_page$clickElement() link_next_page$clickElement()
# wait some time again
randomized_sleep() randomized_sleep()
# collect relevant search items on current page # collect relevant search items on current page
...@@ -59,16 +60,16 @@ out <- tryCatch({ ...@@ -59,16 +60,16 @@ out <- tryCatch({
# close browser, rest uses xml2 and rvest from previous exercise # close browser, rest uses xml2 and rvest from previous exercise
remote_driver$close() remote_driver$close()
# scrape all regpks with links obtained above
retrieved_regpks = list() retrieved_regpks = list()
for (item in retrieved_items){ for (item in retrieved_items){
print(item[[1]]) #print(item[[1]])
retrieved_regpks <- retrieved_regpks %>% append(list(scrape_regpk_current(item[[1]]))) retrieved_regpks <- retrieved_regpks %>% append(list(scrape_regpk_current(item[[1]])))
} }
# write result into json file
retrieved_regpks %>% toJSON( encoding="UTF-8", pretty = TRUE, auto_unbox=TRUE) %>% readr::write_file("./result.json") retrieved_regpks %>% toJSON( encoding="UTF-8", pretty = TRUE, auto_unbox=TRUE) %>% readr::write_file("./result.json")
}, },
error=function(cond) { error=function(cond) {
message("Here's the original error message:") message("Here's the original error message:")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment