0

I can successfully scrape one id at a time. As someone new to R, it might be simpler to scrape the ids separately and combine the results afterwards, but I really want to learn whether I can write a loop that does it automatically. The format returned by the website is the same for every id, but each id's result has a different length (this is not the problem in this case).

Here is my code if I only scrape by one id:

library(XML)
library(rvest)
library(plyr)
library(dplyr)
library(httr)
library(data.table)
library(pipeR)
library(xml2)

# Scrape the schedule for a single teacher id and flatten it into a data frame
# with one row per <schedule> element.
url <- "XXXXXXXXXXX"
session <- html_session(url)
form <- html_form(read_html(url))[[1]]
filled_form <- set_values(form,
                          "id" = "S1",
                          "start" = "2017-01-17",
                          "end" = "2017-02-03",
                          "Password" = "lll")
s <- submit_form(session, filled_form)
z <- read_xml(s$response)
z1 <- as_list(z)
# Keep only the top-level element(s) named "scheduleList".
z2 <- z1[which(names(z1) == "scheduleList")]

schedule_list <- z2[[1]]
# Element 1 is the <teacher> node; every later element is a <schedule>.
teacher_node <- schedule_list[[1]]
# seq_along(x)[-1] is empty when there are no schedules, whereas
# 2:length(x) would count down (2, 1) and index the wrong elements.
schedule_idx <- seq_along(schedule_list)[-1]

# Build one single-row data frame per schedule, then bind them once instead
# of growing the result inside the loop.
rows <- lapply(schedule_idx, function(i) {
  schedule_node <- schedule_list[[i]]
  score_node <- schedule_node[[1]]
  class_node <- score_node[[1]]
  dept_node <- class_node[[3]]
  data.frame(
    teacher = teacher_node[[1]][[1]],
    t_id = attr(teacher_node, "id"),
    Date = attr(schedule_node, "date"),
    class = class_node[[1]][[1]],
    c_id = attr(class_node, "id"),
    c_status = attr(class_node, "status"),
    score = attr(score_node, "id"),
    People = class_node[[2]][[1]],
    department = dept_node[[1]][[1]],
    d_id = attr(dept_node, "id"),
    stringsAsFactors = FALSE
  )
})

result <- if (length(rows) > 0) do.call(rbind, rows) else data.frame()

S1's structure looks like:

structure(list(
scheduleList = structure(list(
teacher = structure(list(name = list("Mary")), .Names = "name", id = "S1"), 
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("312c"), people = list("129"), 
department = structure(list(name = list("English")), .Names = "name", id = "302f")), 
.Names = c("name", "people", "department"), id = "312", status = "-4")), 
.Names = "class", id = "1")), 
.Names = "score", date = "2017-01-18"), 
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("316c"), people = list("87"), 
department = structure(list(name = list("English")), .Names = "name", id = "302f")), 
.Names = c("name", "people", "department"), id = "316", status = "-2")), 
.Names = "class", id = "2")), 
.Names = "score", date = "2017-01-30")), 
.Names = c("teacher", "schedule", "schedule"), from = "2017-01-17", to = "2017-02-03")), 
.Names = "scheduleList")

S1's xml format from the website:

<result status="success">
  <code>1</code>
  <note>success</note>
  <scheduleList from="2017-01-17" to="2017-02-03">
    <teacher id="S1">
      <name>Mary</name>
    </teacher>
    <schedule date="2017-01-18">
      <score id="1">
        <class id="312" status="-4">
          <name>312C</name>
          <people>129</people>
          <department id="302f">
            <name>English</name>
          </department>
        </class>
      </score>
    </schedule>
    <schedule date="2017-01-30">
      <score id="2">
        <class id="316" status="-2">
          <name>316c</name>
          <people>87</people>
          <department id="302f">
            <name>English</name>
          </department>
        </class>
      </score>
    </schedule>
  </scheduleList>
</result>

Here is the loop I tried to do:

    # Attempted loop: fill in and submit the form once per id in d.
    url <- "XXXXXXXXXXX"
        session <-html_session(url)
        form  <-html_form(read_html(url))[[1]]
        for (i in 1:2){
          d=c("S1","S2")
          # NOTE(review): neither filled_form nor s is created before the loop
          # in this snippet, which is consistent with the
          # "object 'filled_form' not found" error reported below.
          filled_form[i] <- set_values(form,
                                       "id" = d[i],
                                       "start" = "2017-01-17",
                                       "end" = "2017-02-03",
                                       "Password" = "lll")
          s[i] <- submit_form(session,filled_form[i])
        }

    Error in filled_form[i] <- set_values(form, id = d[i],  : 
      object 'filled_form' not found

It worked perfectly when I did it twice like this, in an inefficient way:

    # Scrape the schedules for ids S1 and S2 and stack them into one data frame.
    url <- "XXXXXXXXXXX"
    session <- html_session(url)
    form <- html_form(read_html(url))[[1]]

    # Submit the form for one id and return the parsed response, restricted to
    # its top-level element(s) named "scheduleList".
    fetch_schedule_list <- function(id) {
      filled <- set_values(form,
                           "id" = id,
                           "start" = "2017-01-17",
                           "end" = "2017-02-03",
                           "Password" = "lll")
      submitted <- submit_form(session, filled)
      parsed <- as_list(read_xml(submitted$response))
      parsed[which(names(parsed) == "scheduleList")]
    }

    # Flatten one scheduleList (teacher node first, then schedule nodes) into a
    # data frame with one row per schedule.
    schedule_to_df <- function(schedule_list) {
      teacher_node <- schedule_list[[1]]
      # seq_along(x)[-1] is empty when there are no schedules, whereas
      # 2:length(x) would count down (2, 1) and index the wrong elements.
      schedule_idx <- seq_along(schedule_list)[-1]
      rows <- lapply(schedule_idx, function(i) {
        schedule_node <- schedule_list[[i]]
        score_node <- schedule_node[[1]]
        class_node <- score_node[[1]]
        dept_node <- class_node[[3]]
        data.frame(
          teacher = teacher_node[[1]][[1]],
          t_id = attr(teacher_node, "id"),
          Date = attr(schedule_node, "date"),
          class = class_node[[1]][[1]],
          c_id = attr(class_node, "id"),
          c_status = attr(class_node, "status"),
          score = attr(score_node, "id"),
          People = class_node[[2]][[1]],
          department = dept_node[[1]][[1]],
          d_id = attr(dept_node, "id"),
          stringsAsFactors = FALSE
        )
      })
      if (length(rows) > 0) do.call(rbind, rows) else data.frame()
    }

    z2 <- fetch_schedule_list("S1")
    result <- schedule_to_df(z2[[1]])

    # The duplicated version looped over z21 but read every field from z2, so
    # result1 repeated S1's data; S2's rows now come from S2's own response.
    z21 <- fetch_schedule_list("S2")
    result1 <- schedule_to_df(z21[[1]])
    result1
    df <- rbind(result, result1)
Ching
  • 135
  • 1
  • 9
  • Are you able to share the url as there may be ways to more optimally extract the relevant data using xml tools – user20650 Sep 10 '17 at 12:31
  • @user20650 Hi! I won't be able to provide the url, sorry. Plus, you need to buy an account to login. The xml format website looks exactly the same like I provide above, but S1 has 2 dates in class, S2 has 5 dates in class, the length is slightly different. – Ching Sep 10 '17 at 14:27
  • @user20650 Just uploaded my ineffective code above – Ching Sep 10 '17 at 14:48

1 Answers1

2

You need to create the objects `filled_form` and `s` before storing values in them. You have them in your original code, but they are never created before the loop.

# Fill in and submit the form once per id, keeping every filled form and every
# resulting session in a list.
url <- "XXXXXXXXXXX"
session <- html_session(url)
form <- html_form(read_html(url))[[1]]
# The ids do not change between iterations, so define them once up front.
d <- c("S1", "S2")
# Preallocate the containers; assigning into an object that does not exist
# yet is what raised "object 'filled_form' not found".
filled_form <- vector("list", length(d))
s <- vector("list", length(d))
for (i in seq_along(d)) {
  filled_form[[i]] <- set_values(form,
                                 "id" = d[i],
                                 "start" = "2017-01-17",
                                 "end" = "2017-02-03",
                                 "Password" = "lll")
  # [[i]] extracts the form itself; [i] would pass a length-1 list wrapping
  # the form, which submit_form() cannot use.
  s[[i]] <- submit_form(session, filled_form[[i]])
}
D.sen
  • 938
  • 5
  • 14
  • You can also optimize this by creating a user defined function and passing an `ids` list through the `lapply` function. – D.sen Sep 09 '17 at 17:21
  • it didn't work, it said Error: Could not find possible submission target. In addition: Warning message: In filled_form[i] <- set_values(form, "id" = d[i], : number of items to replace is not a multiple of replacement length – Ching Sep 10 '17 at 03:41
  • I've updated the answer above to try the loop using double brackets around return lists. Try this to see if it helps. – D.sen Sep 10 '17 at 15:00
  • Yeah, it works! Amazing :) Why does it use double brackets? By the way, do I need to put the code below in the loop as well? If I want to use the read_xml function, there are still two responses to process, i.e. s[[1]]$response and s[[2]]$response. Thanks a lot! – Ching Sep 10 '17 at 15:50
  • Then `z[[i]] <- read_html(s[[i]]$response)` will work if you initialize `z` as a list prior to the loop like `s` and `filled_form`. The difference between single and double brackets is a large subject on its own... I'd check out related SO posts like https://stackoverflow.com/questions/1169456/the-difference-between-and-notations-for-accessing-the-elements-of-a-lis – D.sen Sep 10 '17 at 15:58
  • I just finished it! Your information is very helpful. Thanks :) – Ching Sep 10 '17 at 17:02