I am pretty new to R. I scraped a website that required login yesterday, the page is xml format like below.
<result status="success">
<code>1</code>
<note>success</note>
<teacherList>
<teacher id="D95">
<name>Mary</name>
<department id="420">
<name>Math</name>
</department>
<department id="421">
<name>Statistics</name>
</department>
</teacher>
<teacher id="D73">
<name>Adam</name>
<department id="412">
<name>English</name>
</department>
</teacher>
</teacherList>
</result>
Recently I just Converted an XML to a list.
library(XML)
library(rvest)
library(plyr)
library(dplyr)
library(httr)
library(pipeR)
library(xml2)
url.address <- "http://xxxxxxxxxxxxxxxxx"
session <-html_session(url.address)
form <-html_form(read_html(url.address))[[1]]
filled_form <- set_values(form,
"userid" = "id",
"Password" = "password")
s <- submit_form(session,filled_form)
z = read_xml(s$response)
z1 = as_list(z)
z2 <- z1$teacherList
Now I need to extract data from a list and make it as a data frame. By the way, some people belong to 2 departments, but some only belong to 1. A part of the list z2 looks like below:
z2[[1]]
$name
$name[[1]]
[1] "Mary"
$department
$department$name
$department$name[[1]]
[1] "Math"
attr(,"id")
[1] "420"
$department
$department$name
$department$name[[1]]
[1] "statistics"
attr(,"id")
[1] "421"
attr(,"id")
[1] "D95236"
When I extracted them one by one, it took too long:
attr(z2[[1]],"id")
"D95"
z2[[1]][[1]][[1]]
"Mary"
z2[[1]][[2]][[1]][[1]]
"Math"
attr(z2[[1]][[2]], "id")
"420"
z2[[1]][[3]][[1]][[1]]
"statistics"
attr(z2[[1]][[3]], "id")
"421"
attr(z2[[2]],"id")
"D73"
z2[[2]][[1]][[1]]
"Adam"
z2[[2]][[2]][[1]][[1]]
"English"
attr(z2[[2]][[2]],"id")
"412"
So I tried to write a loop:
for (x in 1:2){
for (y in 2:3){
a <- attr(z2[[x]],"id")
b <- z2[[x]][[1]][[1]]
d <- z2[[x]][[y]][[1]][[1]]
e <- attr(z2[[x]][[y]],"id")
g <- cbind(print(a),print(b),print(d),print(e))
}}
but it doesn't work at all since some of the people only belong to one department. The result I expected:
Any advice would be appreciated!
dput(head(z2, 10))
structure(list(teacher = structure(list(name = list("Mary"),
department = structure(list(name = list("Math")), .Names = "name", id = "420"),
department = structure(list(name = list("statistics")), .Names = "name", id = "421")), .Names = c("name",
"department", "department"), id = "D95"), teacher = structure(list(
name = list("Adam"), department = structure(list(name = list(
"English")), .Names = "name", id = "412")), .Names = c("name",
"department"), id = "D73"), teacher = structure(list(name = list(
"Kevin"), department = structure(list(name = list("Chinese")), .Names = "name", id = "201")), .Names = c("name",
"department"), id = "D101"), teacher = structure(list(name = list(
"Nana"), department = structure(list(name = list("Science")), .Names = "name", id = "205")), .Names = c("name",
"department"), id = "D58"), teacher = structure(list(name = list(
"Nelson"), department = structure(list(name = list("Music")), .Names = "name", id = "370")), .Names = c("name",
"department"), id = "D14"), teacher = structure(list(name = list(
"Esther"), department = structure(list(name = list("Medicine")), .Names = "name", id = "361")), .Names = c("name",
"department"), id = "D28"), teacher = structure(list(name = list(
"Mia"), department = structure(list(name = list("Chemistry")), .Names = "name", id = "326")), .Names = c("name",
"department"), id = "D17"), teacher = structure(list(name = list(
"Jack"), department = structure(list(name = list("German")), .Names = "name", id = "306")), .Names = c("name",
"department"), id = "D80"), teacher = structure(list(name = list(
"Tom"), department = structure(list(name = list("French")), .Names = "name", id = "360")), .Names = c("name",
"department"), id = "D53"), teacher = structure(list(name = list(
"Allen"), department = structure(list(name = list("Spanish")), .Names = "name", id = "322")), .Names = c("name",
"department"), id = "D18")), .Names = c("teacher", "teacher",
"teacher", "teacher", "teacher", "teacher", "teacher", "teacher", "teacher",
"teacher"))