I can successfully scrape one id at a time. As someone new to R, it might be simpler to scrape the ids separately and combine the results afterwards, but I really want to learn whether I can write a loop that does this automatically. The format returned by the website is the same for every id, but each id returns a different number of entries (this is not the problem in this case).
Here is my code if I only scrape by one id:
library(XML)
library(rvest)
library(plyr)
library(dplyr)
library(httr)
library(data.table)
library(pipeR)
library(xml2)
# Scrape the schedule for one teacher id ("S1") and flatten the XML response
# into `result`, a data frame with one row per <schedule> entry.
url <- "XXXXXXXXXXX"
session <- html_session(url)
form <- html_form(read_html(url))[[1]]
filled_form <- set_values(
  form,
  "id" = "S1",
  "start" = "2017-01-17",
  "end" = "2017-02-03",
  "Password" = "lll"
)
s <- submit_form(session, filled_form)
z <- read_xml(s$response)
z1 <- as_list(z)
# Keep only the <scheduleList> element; z2[[1]] is then the list of its
# children (<teacher> first, followed by the <schedule> entries).
z2 <- z1[which(names(z1) == "scheduleList")]

# Indices of the <schedule> children. seq_along(...)[-1] is empty when the
# response holds only the <teacher> node, whereas 2:length(...) would count
# backwards (2, 1) and fail on a missing element.
schedule_idx <- seq_along(z2[[1]])[-1]
rows <- vector("list", length(schedule_idx))
for (k in seq_along(schedule_idx)) {
  i <- schedule_idx[k]
  # One-row character matrix; the teacher fields repeat on every row.
  row <- cbind(
    teacher = z2[[1]][[1]][[1]][[1]],
    t_id = attr(z2[[1]][[1]], "id"),
    Date = attr(z2[[1]][[i]], "date"),
    class = z2[[1]][[i]][[1]][[1]][[1]][[1]],
    c_id = attr(z2[[1]][[i]][[1]][[1]], "id"),
    c_status = attr(z2[[1]][[i]][[1]][[1]], "status"),
    score = attr(z2[[1]][[i]][[1]], "id"),
    People = z2[[1]][[i]][[1]][[1]][[2]][[1]],
    department = z2[[1]][[i]][[1]][[1]][[3]][[1]][[1]],
    d_id = attr(z2[[1]][[i]][[1]][[1]][[3]], "id")
  )
  rows[[k]] <- row
}
# Bind once at the end instead of growing `result` on every iteration.
result <- if (length(rows) > 0) {
  as.data.frame(do.call(rbind, rows), stringsAsFactors = FALSE)
} else {
  data.frame()
}
S1's structure looks like:
# dput() of the parsed response for id "S1": a one-element list named
# "scheduleList" whose children are one "teacher" node followed by one
# "schedule" node per date. XML attributes (id, status, date, from, to)
# are stored as R attributes on the corresponding list.
structure(list(
scheduleList = structure(list(
teacher = structure(list(name = list("Mary")), .Names = "name", id = "S1"),
# First schedule entry (date = "2017-01-18").
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("312c"), people = list("129"),
department = structure(list(name = list("English")), .Names = "name", id = "302f")),
.Names = c("name", "people", "department"), id = "312", status = "-4")),
.Names = "class", id = "1")),
.Names = "score", date = "2017-01-18"),
# Second schedule entry (date = "2017-01-30").
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("316c"), people = list("87"),
department = structure(list(name = list("English")), .Names = "name", id = "302f")),
.Names = c("name", "people", "department"), id = "316", status = "-2")),
.Names = "class", id = "2")),
.Names = "score", date = "2017-01-30")),
.Names = c("teacher", "schedule", "schedule"), from = "2017-01-17", to = "2017-02-03")),
.Names = "scheduleList")
S1's xml format from the website:
<result status="success">
<code>1</code>
<note>success</note>
<scheduleList from="2017-01-17" to="2017-02-03">
<teacher id="S1">
<name>Mary</name>
</teacher>
<schedule date="2017-01-18">
<score id="1">
<class id="312" status="-4">
<name>312C</name>
<people>129</people>
<department id="302f">
<name>English</name>
</department>
</class>
</score>
</schedule>
<schedule date="2017-01-30">
<score id="2">
<class id="316" status="-2">
<name>316c</name>
<people>87</people>
<department id="302f">
<name>English</name>
</department>
</class>
</score>
</schedule>
</scheduleList>
</result>
Here is the loop I tried to do:
# Attempt to fill in and submit the same form once per teacher id in a loop.
url <- "XXXXXXXXXXX"
session <-html_session(url)
form <-html_form(read_html(url))[[1]]
for (i in 1:2){
# The id vector is rebuilt on every iteration although it does not depend on i.
d=c("S1","S2")
# NOTE(review): `filled_form[i] <- ...` assigns into an existing object, but
# nothing in this snippet creates `filled_form` before the loop; this is the
# statement the "object 'filled_form' not found" error refers to.
# Presumably a list created before the loop and filled with `[[i]]` was
# intended -- confirm.
filled_form[i] <- set_values(form,
"id" = d[i],
"start" = "2017-01-17",
"end" = "2017-02-03",
"Password" = "lll")
# NOTE(review): `s` is likewise never created in this snippet before it is
# indexed, and `filled_form[i]` (single bracket) is passed where the
# single-id code passes the filled form itself -- verify.
s[i] <- submit_form(session,filled_form[i])
}
Error in filled_form[i] <- set_values(form, id = d[i], :
object 'filled_form' not found
It worked perfectly when I did it twice like this, in an inefficient way:
# Scrape two teacher ids ("S1" and "S2") one after the other with the same
# form, flatten each XML response into a data frame with one row per
# <schedule> entry, and stack the two results into `df`.
url <- "XXXXXXXXXXX"
session <- html_session(url)
form <- html_form(read_html(url))[[1]]

# ---- First id: S1 ----
filled_form <- set_values(
  form,
  "id" = "S1",
  "start" = "2017-01-17",
  "end" = "2017-02-03",
  "Password" = "lll"
)
s <- submit_form(session, filled_form)
z <- read_xml(s$response)
z1 <- as_list(z)
# Keep only the <scheduleList> element; z2[[1]] is then the list of its
# children (<teacher> first, followed by the <schedule> entries).
z2 <- z1[which(names(z1) == "scheduleList")]

# seq_along(...)[-1] is empty when there are no <schedule> children, whereas
# 2:length(...) would count backwards (2, 1) and fail on a missing element.
schedule_idx <- seq_along(z2[[1]])[-1]
rows <- vector("list", length(schedule_idx))
for (k in seq_along(schedule_idx)) {
  i <- schedule_idx[k]
  row <- cbind(
    teacher = z2[[1]][[1]][[1]][[1]],
    t_id = attr(z2[[1]][[1]], "id"),
    Date = attr(z2[[1]][[i]], "date"),
    class = z2[[1]][[i]][[1]][[1]][[1]][[1]],
    c_id = attr(z2[[1]][[i]][[1]][[1]], "id"),
    c_status = attr(z2[[1]][[i]][[1]][[1]], "status"),
    score = attr(z2[[1]][[i]][[1]], "id"),
    People = z2[[1]][[i]][[1]][[1]][[2]][[1]],
    department = z2[[1]][[i]][[1]][[1]][[3]][[1]][[1]],
    d_id = attr(z2[[1]][[i]][[1]][[1]][[3]], "id")
  )
  rows[[k]] <- row
}
# Bind once at the end instead of growing `result` on every iteration.
result <- if (length(rows) > 0) {
  as.data.frame(do.call(rbind, rows), stringsAsFactors = FALSE)
} else {
  data.frame()
}

# ---- Second id: S2 ----
filled_form1 <- set_values(
  form,
  "id" = "S2",
  "start" = "2017-01-17",
  "end" = "2017-02-03",
  "Password" = "lll"
)
s1 <- submit_form(session, filled_form1)
# Note: `z1` is reused here and now holds the raw XML document for S2.
z1 <- read_xml(s1$response)
z11 <- as_list(z1)
z21 <- z11[which(names(z11) == "scheduleList")]

schedule_idx1 <- seq_along(z21[[1]])[-1]
rows1 <- vector("list", length(schedule_idx1))
for (k in seq_along(schedule_idx1)) {
  i <- schedule_idx1[k]
  # Every field is read from z21 (the S2 response). Reading from z2 here
  # would silently repeat the S1 data in `result1`.
  row1 <- cbind(
    teacher = z21[[1]][[1]][[1]][[1]],
    t_id = attr(z21[[1]][[1]], "id"),
    Date = attr(z21[[1]][[i]], "date"),
    class = z21[[1]][[i]][[1]][[1]][[1]][[1]],
    c_id = attr(z21[[1]][[i]][[1]][[1]], "id"),
    c_status = attr(z21[[1]][[i]][[1]][[1]], "status"),
    score = attr(z21[[1]][[i]][[1]], "id"),
    People = z21[[1]][[i]][[1]][[1]][[2]][[1]],
    department = z21[[1]][[i]][[1]][[1]][[3]][[1]][[1]],
    d_id = attr(z21[[1]][[i]][[1]][[1]][[3]], "id")
  )
  rows1[[k]] <- row1
}
result1 <- if (length(rows1) > 0) {
  as.data.frame(do.call(rbind, rows1), stringsAsFactors = FALSE)
} else {
  data.frame()
}
result1

# Stack the per-id tables into a single data frame.
df <- rbind(result, result1)