From 83de5bda2b857c0f1503f16bc8fa53e60c7172e2 Mon Sep 17 00:00:00 2001 From: "Gallenkamp, Fabian (bax1489)" <476-bax1489@users.noreply.gitlab.rrz.uni-hamburg.de> Date: Mon, 27 Jan 2020 09:56:06 +0100 Subject: [PATCH] Update exercise2.R --- exercise2.R | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/exercise2.R b/exercise2.R index 6e62182..bf9cf10 100644 --- a/exercise2.R +++ b/exercise2.R @@ -8,6 +8,7 @@ setwd(dirname(rstudioapi::getSourceEditorContext()$path)) source("./exercise1.R") scrape_regpk_links <- function(remote_driver) { + # select only links containing 'regierungspressekonferenz' items <- remote_driver$findElements(using = "xpath", "//a[@class='bpa-teaser']") %>% purrr::map(function(x){x$getElementAttribute('href')}) %>% purrr::keep(function(x){str_detect(x, "regierungspressekonferenz")}) @@ -24,22 +25,22 @@ rsdriver <- NULL out <- tryCatch({ retrieved_items = list() - + # create browser instance and set up communication with webdriver (select a chrome version according to your installation, refer also to above troubleshooting to see possible values) rsdriver <- rsDriver(browser=c("chrome"), chromever="79.0.3945.36", port=8888L, ) - + # get webdriver remote_driver <- rsdriver[["client"]] - + # goto first search page remote_driver$navigate("https://www.bundesregierung.de/breg-de/aktuelles/pressekonferenzen") - - #randomized_sleep() + # wait some time + randomized_sleep() # collect relevant search items on current page new_items <- scrape_regpk_links(remote_driver) retrieved_items <- retrieved_items %>% append(new_items) i <- 0 - max_pages = 0 - while (TRUE) { + max_pages = 2 + while (TRUE) { # press next unitil not possible or max pages reached link_next_page <- remote_driver$findElement(value = '//li[@class = "bpa-pager-next"]/a') @@ -47,7 +48,7 @@ out <- tryCatch({ break } link_next_page$clickElement() - + # wait some time again randomized_sleep() # collect relevant search items on current page @@ -59,16 +60,16 @@ out <- tryCatch({ # close browser, rest uses xml2 and rvest from previous exercise remote_driver$close() + # scrape all regpks with links obtained above retrieved_regpks = list() for (item in retrieved_items){ - print(item[[1]]) + #print(item[[1]]) retrieved_regpks <- retrieved_regpks %>% append(list(scrape_regpk_current(item[[1]]))) } + # write result into json file retrieved_regpks %>% toJSON( encoding="UTF-8", pretty = TRUE, auto_unbox=TRUE) %>% readr::write_file("./result.json") - - }, error=function(cond) { message("Here's the original error message:") -- GitLab