0

A website contains some reviews of books, which I would like to scrape with rvest. It's possible to get the data like this:

library(rvest)
library(purrr)
library(tibble)
library(tidyr)
library(dplyr)

# Scrape listing pages 1 and 2. Each element of `result_list` is the value of
# `list(...) %>% rbind()`: a 1 x 4 list-matrix whose single row is named ".".
result_list <-
  lapply(1:2, function(page_no) {
    base_url <- "http://www.deutschlandradiokultur.de/buchkritik.949.de.html?drbm:page="
    doc <- xml2::read_html(paste0(base_url, page_no))

    # NOTE: list() has no `stringsAsFactors` argument, so this is stored as a
    # fourth element named "stringsAsFactors" rather than acting as an option.
    list(
      page = html_text(html_nodes(doc, "span.drk-paginationanzahl")),
      date = html_text(html_nodes(html_nodes(doc, ".drk-container"), ".drk-sendungdatum")),
      text = html_text(html_nodes(html_nodes(doc, ".drk-container"), ".drk-overline")),
      stringsAsFactors = FALSE
    ) %>%
      rbind()
  })

The length of "date" sometimes differs from that of "text", so I used a list. Now I am struggling to convert the list into a data frame. Do you have any hints for converting the list? Maybe there is a more elegant way to scrape the site that avoids this problem... The data frame should have the columns "page", "date" and "text". (In a next step I will split the content of "text" into author and title.)

I tried the approaches:

# Seven attempts at turning `result_list` (one list-matrix per scraped page)
# into a single data frame. They are kept exactly as tried.

# Attempt 1: row-bind the per-page elements, then coerce to a data frame.
result_df1 <-
  as.data.frame(do.call(rbind, result_list))

# Attempt 2: convert every page with data.frame() first, then row-bind.
result_df2 <-
  as.data.frame(do.call(rbind, lapply(result_list, data.frame, stringsAsFactors=FALSE)))

# Attempt 3: flatten each page with unlist() and fold the results with rbind.
# NOTE(review): the flattened vectors presumably differ in length when a page
# has unequal numbers of dates and texts -- confirm how rbind() pads them.
result_df3 <-
  as.data.frame(Reduce( rbind, lapply(result_list, unlist) ))

# Attempt 4: flatten each page and pass the list of vectors to as.data.frame(),
# i.e. one column per page.
result_df4 <-
  as.data.frame(lapply(result_list, unlist))

# Attempt 5: call tidyr::unnest() on each page separately; the result is still
# a list with one entry per page.
result_df5 <-
  lapply(result_list, tidyr::unnest)

# Attempt 6: map unlist() over the list with purrr::dmap().
# NOTE(review): dmap() may not be exported by current purrr releases -- verify.
result_df6 <-
  result_list %>% purrr::dmap(unlist)

# Attempt 7: flatten one level, turn the list into a name/value tibble with
# enframe(), then unnest the value column.
result_df7 <-
  result_list %>%
  unlist(recursive = FALSE) %>%
  tibble::enframe() %>%
  unnest()

In result_df1 and result_df2, the data frame has a list in each cell. How is it possible to unlist these by column? I think a big problem is that the lengths differ per list element. How can I handle this?

Example1 is similar to my problem, with different lengths in the list. Even with equal lengths (example2) I struggle with the conversion to a data frame.

# Offline stand-in for two scraped pages. Every element is a 1 x 4 list-matrix
# (row name ".") with the cells page / date / text / stringsAsFactors. The
# second page has three dates but four texts, i.e. unequal lengths.
example1 <- lapply(
  list(
    list(
      "page 1/490",
      c("a", "b", "c", "d"),
      c("author1: \"title1\"", "author2: \"title2\"",
        "author3: \"title3\"", "author4: \"title4\""),
      FALSE
    ),
    list(
      "page 2/490",
      c("e", "f", "g"),
      c("author5: \"title5\"", "author6: \"title6\"",
        "author7: \"title7\"", "author8: \"title8\""),
      FALSE
    )
  ),
  function(cells) {
    # A list passed to matrix() yields a list-matrix; one row, one cell per column.
    matrix(
      cells,
      nrow = 1L,
      dimnames = list(".", c("page", "date", "text", "stringsAsFactors"))
    )
  }
)


# Offline stand-in with equal lengths: every element is a 1 x 3 list-matrix
# (row name ".") with the cells date / text / stringsAsFactors, and both
# character vectors have four entries.
example2 <- lapply(
  list(
    list(
      c("a", "b", "c", "d"),
      c("author1: \"title1\"", "author2: \"title2\"",
        "author3: \"title3\"", "author4: \"title4\""),
      FALSE
    ),
    list(
      c("e", "f", "g", "h"),
      c("author5: \"title5\"", "author6: \"title6\"",
        "author7: \"title7\"", "author8: \"title8\""),
      FALSE
    )
  ),
  function(cells) {
    # A list passed to matrix() yields a list-matrix; one row, one cell per column.
    matrix(
      cells,
      nrow = 1L,
      dimnames = list(".", c("date", "text", "stringsAsFactors"))
    )
  }
)
Alexander
  • 73
  • 7
  • 2
    You should provide a [reproducible example](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) (ideally one that doesn't involve web scraping) and provide the desired output to make it clearer what you are trying to do. – MrFlick Apr 24 '17 at 23:00
  • On the second page, one of the `.drk-sendungdatum` elements is just not on the page. Because there's no enclosing tags for each set, it's a bit of a pain to figure out where the `NA` needs to go. – alistaire Apr 24 '17 at 23:35
  • 1
    As alistaire notes, there seems to be a structural problem with the pages. I tried looking at it but could not find a consistent way. Did you notice they publish RSS links (look at the bottom of the web page)? This may give you more consistent XML to process. – Andrew Lavers Apr 25 '17 at 00:19
  • @MrFlick I have added an example. – Alexander Apr 25 '17 at 22:23

1 Answer

3

On the second page, one of the .drk-sendungdatum items is missing from its corresponding item, so the date and text vectors have different lengths and a data.frame can't be constructed. Because there are no enclosing tags, it is hard to figure out where an NA should go, but it is relatively easy to subset out the extra .drk-overline item by selecting only articles that immediately follow a span (the + combinator):

library(tidyverse)
library(rvest)

# Download and parse listing pages 1 and 2 once; `pages` is a list of parsed
# documents that the later steps query.
pages <- 1:2 %>% 
    paste0("http://www.deutschlandradiokultur.de/buchkritik.949.de.html?drbm:page=", .) %>% 
    map(read_html)

# Build one tibble per page from its `.drk-container` nodes and row-bind them.
# The `span + article` selector keeps only `.drk-overline` nodes inside an
# <article> that directly follows a <span>, which drops the entry that has no
# matching `.drk-sendungdatum`.
# tibble() replaces the deprecated data_frame(); the arguments are unchanged.
result_list <- pages %>% 
    map(html_nodes, '.drk-container') %>%
    map_df(~tibble(page = .x %>% html_node("span.drk-paginationanzahl") %>% html_text(),
                   date = .x %>% html_nodes(".drk-sendungdatum") %>% html_text(),
                   text = .x %>% html_nodes("span + article .drk-overline") %>% html_text()))

If you really want, you can reparse pages to add the problematic observation, though it takes a little work:

# Re-query every page for the entry that was dropped earlier: the first
# `.drk-overline` inside an <article> whose preceding sibling is NOT a <span>.
# Pages without such an entry give `text = NA` and are removed by drop_na().
# The remaining rows are placed on top of the existing `result_list`.
result_list <- pages %>% 
    map_df(~list(page = .x %>% html_node('span.drk-paginationanzahl') %>% html_text(), 
                 # Typed NA: keeps `date` a character column so bind_rows()
                 # does not depend on coercing a logical column.
                 date = NA_character_, 
                 text = .x %>% html_node('.drk-container :not(span) + article .drk-overline') %>% html_text())) %>% 
    drop_na(text) %>% 
    bind_rows(result_list)

Now note the NA value of date at the top:

result_list
#> # A tibble: 50 × 3
#>           page                   date
#>          <chr>                  <chr>
#> 1  Seite 2/490                   <NA>
#> 2  Seite 1/490 Sendung vom 24.04.2017
#> 3  Seite 1/490 Sendung vom 21.04.2017
#> 4  Seite 1/490 Sendung vom 20.04.2017
#> 5  Seite 1/490 Sendung vom 19.04.2017
#> 6  Seite 1/490 Sendung vom 18.04.2017
#> 7  Seite 1/490 Sendung vom 15.04.2017
#> 8  Seite 1/490 Sendung vom 13.04.2017
#> 9  Seite 1/490 Sendung vom 12.04.2017
#> 10 Seite 1/490 Sendung vom 11.04.2017
#> # ... with 40 more rows, and 1 more variables: text <chr>
alistaire
  • 42,459
  • 4
  • 77
  • 117