Update exercise2.R

83de5bda · Gallenkamp, Fabian · 2793fc7d · 83de5bda
Commit 83de5bda authored Jan 27, 2020 by Gallenkamp, Fabian
--- a/exercise2.R
+++ b/exercise2.R
@@ -8,6 +8,7 @@ setwd(dirname(rstudioapi::getSourceEditorContext()$path))
 source("./exercise1.R")

 scrape_regpk_links <- function(remote_driver) {
+  # select only links containing 'regierungspressekonferenz'
  items <- remote_driver$findElements(using = "xpath", "//a[@class='bpa-teaser']") %>%
    purrr::map(function(x){x$getElementAttribute('href')}) %>%
    purrr::keep(function(x){str_detect(x, "regierungspressekonferenz")})
@@ -24,22 +25,22 @@ rsdriver <- NULL
 out <- tryCatch({
  
  retrieved_items = list()
-  
+  # create a browser instance and set up communication with the webdriver (choose the Chrome version matching your installation; see the troubleshooting notes above for possible values)
  rsdriver <- rsDriver(browser=c("chrome"), chromever="79.0.3945.36",  port=8888L, )
-
+  # get webdriver
  remote_driver <- rsdriver[["client"]]
-
+  # go to the first search page
  remote_driver$navigate("https://www.bundesregierung.de/breg-de/aktuelles/pressekonferenzen")
-  
-  #randomized_sleep()
+  # wait some time
+  randomized_sleep()

  # collect relevant search items on current page
  new_items <- scrape_regpk_links(remote_driver)
  retrieved_items <- retrieved_items %>% append(new_items)

  i <- 0
-  max_pages = 0
-  while (TRUE) {
+  max_pages = 2
+  while (TRUE) { # click "next" until no longer possible or max pages reached
    
    link_next_page <- remote_driver$findElement(value = '//li[@class = "bpa-pager-next"]/a')
    
@@ -47,7 +48,7 @@ out <- tryCatch({
      break
    }
    link_next_page$clickElement()
-    
+    # wait some time again
    randomized_sleep()
    
    # collect relevant search items on current page
@@ -59,16 +60,16 @@ out <- tryCatch({
  # close browser, rest uses xml2 and rvest from previous exercise
  remote_driver$close()
  
+  # scrape all regpks with links obtained above
  retrieved_regpks = list()
  for (item in retrieved_items){
-    print(item[[1]])
+    #print(item[[1]])
    retrieved_regpks <- retrieved_regpks %>% append(list(scrape_regpk_current(item[[1]])))
  }
  
+  # write result into json file
  retrieved_regpks %>% toJSON( encoding="UTF-8", pretty = TRUE, auto_unbox=TRUE) %>% readr::write_file("./result.json")
  
-
-
 },
 error=function(cond) {
  message("Here's the original error message:")