From 83de5bda2b857c0f1503f16bc8fa53e60c7172e2 Mon Sep 17 00:00:00 2001
From: "Gallenkamp, Fabian (bax1489)"
 <476-bax1489@users.noreply.gitlab.rrz.uni-hamburg.de>
Date: Mon, 27 Jan 2020 09:56:06 +0100
Subject: [PATCH] Update exercise2.R

---
 exercise2.R | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/exercise2.R b/exercise2.R
index 6e62182..bf9cf10 100644
--- a/exercise2.R
+++ b/exercise2.R
@@ -8,6 +8,7 @@ setwd(dirname(rstudioapi::getSourceEditorContext()$path))
 source("./exercise1.R")
 
 scrape_regpk_links <- function(remote_driver) {
+  # keep only teaser links whose href contains 'regierungspressekonferenz'
   items <- remote_driver$findElements(using = "xpath", "//a[@class='bpa-teaser']") %>%
     purrr::map(function(x){x$getElementAttribute('href')}) %>%
     purrr::keep(function(x){str_detect(x, "regierungspressekonferenz")})
@@ -24,22 +25,22 @@ rsdriver <- NULL
 out <- tryCatch({
   
   retrieved_items = list()
-  
+  # create a browser instance and set up communication with the webdriver
+  # (choose a chromever matching your Chrome installation; see the
+  # troubleshooting notes above for possible values)
  rsdriver <- rsDriver(browser=c("chrome"), chromever="79.0.3945.36", port=8888L)
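+  # note: if the chromever above does not match the locally installed Chrome,
+  # the driver versions available to RSelenium can be listed (assuming its
+  # binman dependency is installed) with binman::list_versions("chromedriver")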
-
+  # get the remote driver client
   remote_driver <- rsdriver[["client"]]
-
+  # go to the first search results page
   remote_driver$navigate("https://www.bundesregierung.de/breg-de/aktuelles/pressekonferenzen")
-  
-  #randomized_sleep()
+  # wait a randomized interval before scraping
+  randomized_sleep()
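+  # randomized_sleep() is expected to be defined in exercise1.R (sourced above);
+  # a minimal sketch, assuming it just pauses for a random interval, could be:
+  # randomized_sleep <- function(min = 2, max = 5) Sys.sleep(runif(1, min, max))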
 
   # collect relevant search items on current page
   new_items <- scrape_regpk_links(remote_driver)
   retrieved_items <- retrieved_items %>% append(new_items)
 
   i <- 0
-  max_pages = 0
-  while (TRUE) {
+  max_pages = 2
+  while (TRUE) { # press "next" until no longer possible or max pages reached
     
     link_next_page <- remote_driver$findElement(value = '//li[@class = "bpa-pager-next"]/a')
     
@@ -47,7 +48,7 @@ out <- tryCatch({
       break
     }
     link_next_page$clickElement()
-    
+    # wait again before scraping the next page
     randomized_sleep()
     
     # collect relevant search items on current page
@@ -59,16 +60,16 @@ out <- tryCatch({
   # close browser, rest uses xml2 and rvest from previous exercise
   remote_driver$close()
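+  # note: closing the client does not stop the Selenium server itself;
+  # rsdriver$server$stop() can be used for that once scraping is done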
   
+  # scrape each regpk via the links collected above
   retrieved_regpks = list()
   for (item in retrieved_items){
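+    # scrape_regpk_current() is assumed to come from exercise1.R (sourced at
+    # the top); item[[1]] is the href collected by scrape_regpk_links()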
-    print(item[[1]])
+    #print(item[[1]])
     retrieved_regpks <- retrieved_regpks %>% append(list(scrape_regpk_current(item[[1]])))
   }
   
+  # write the results to a JSON file
   retrieved_regpks %>% toJSON( encoding="UTF-8", pretty = TRUE, auto_unbox=TRUE) %>% readr::write_file("./result.json")
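+  # the written file can be sanity-checked by reading it back, e.g. with
+  # jsonlite::fromJSON("./result.json")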
   
-
-
 },
 error=function(cond) {
   message("Here's the original error message:")
-- 
GitLab