I have to export a GraphML file from R igraph so that I can add column values by hand. When I import the GraphML file again, it has to be proper UTF-8 and valid XML, so I convert the data to UTF-8 with iconv() before saving,
as you can see in the for loop of my code below:
library(igraph)
# Read the raw table into `edges` (header row present, quote handling disabled).
edges <- read.csv2(
  "https://www.dropbox.com/s/p8e7hcck0d4nnrp/Subgraph_nowvalid.graphml?dl=0",
  header = TRUE,
  quote = ""
)
amount <- nrow(edges)
amount

# Placeholder vertex table with one row per row of `edges`: a name, a
# description and eleven numeric attributes, all initialised to empty values.
sources <- data.frame(
  Vertexname = character(amount),
  Description = character(amount),
  Follower = numeric(amount),
  Friends = numeric(amount),
  Favourites = numeric(amount),
  Statuses = numeric(amount),
  ProfileAge = numeric(amount),
  Listed = numeric(amount),
  Timestamp = numeric(amount),
  OutDegree = numeric(amount),
  InDegree = numeric(amount),
  WOutDegree = numeric(amount),
  WInDegree = numeric(amount)
)
# The target-side table starts out with exactly the same layout and contents.
targets <- sources
# Clean every text column of `edges` so it can later be written as XML text:
# re-encode it to UTF-8 (bytes that cannot be converted are dropped via
# sub = "") and strip control characters.
#
# Only character and factor columns are processed. iconv() coerces its input
# with as.character(), so applying it to every column (as was done before)
# silently turned numeric columns into strings. Factor columns still end up as
# character vectors, exactly as before.
#
# NOTE(review): iconv() is called without `from`, so the input is assumed to be
# in the session's native encoding - confirm that this matches the real
# encoding of the input file.
edges[] <- lapply(edges, function(column) {
  if (is.factor(column)) {
    column <- as.character(column)
  }
  if (!is.character(column)) {
    # Numbers, logicals etc. carry no encoding; leave them untouched.
    return(column)
  }
  column <- iconv(column, to = "UTF-8", sub = "")
  gsub("[[:cntrl:]]", "", column)
})
# Source-side vertices: the name comes from edge column 1 and the timestamp
# from edge column 4; every other attribute column is set to NA.
sources[, 1] <- edges[, 1]
sources[, 9] <- edges[, 4]
sources[, c(2:8, 10:13)] <- NA

# Target-side vertices: the name comes from edge column 2, attribute columns
# 2-8 are copied from edge columns 7-13, and the remaining columns are NA.
targets[, 1] <- edges[, 2]
targets[, 2:8] <- edges[, 7:13]
targets[, 9:13] <- NA
print("REPORT: vertices data frames filled")

# Drop rows that are exact repeats of an earlier row.
sources <- unique(sources)
targets <- unique(targets)
print("REPORT: Duplicated sources and targets removed")
# Combine the source-side and target-side vertex tables into one `nodes` table.
# merge(..., all=TRUE) keeps vertices that occur in either table; every
# attribute shared by both tables comes back twice, suffixed .x (from `sources`)
# and .y (from `targets`). Inside within(), each pair is collapsed into a single
# unsuffixed column and the two suffixed columns are removed by assigning NULL.
nodes <- within(merge(sources, targets, by="Vertexname", all=TRUE), {
# Description: take .x unless it is missing, otherwise the .y value passed
# through paste(), which coerces it to character (note that paste() also turns
# a missing value into the literal string "NA").
Description <- ifelse(is.na(Description.x), paste(Description.y), Description.x); Description.x = NULL; Description.y = NULL;
# For the following attributes the .x value wins unless it is missing, in which
# case the .y value is used.
Follower <- ifelse(is.na(Follower.x), Follower.y, Follower.x); Follower.x = NULL; Follower.y = NULL;
Friends <- ifelse(is.na(Friends.x), Friends.y, Friends.x); Friends.x = NULL; Friends.y = NULL;
Favourites <- ifelse(is.na(Favourites.x), Favourites.y, Favourites.x); Favourites.x = NULL; Favourites.y = NULL;
Statuses <- ifelse(is.na(Statuses.x), Statuses.y, Statuses.x); Statuses.x = NULL; Statuses.y = NULL;
ProfileAge <- ifelse(is.na(ProfileAge.x), ProfileAge.y, ProfileAge.x); ProfileAge.x = NULL; ProfileAge.y = NULL;
Listed <- ifelse(is.na(Listed.x), Listed.y, Listed.x); Listed.x = NULL; Listed.y = NULL;
# Timestamp is the one column resolved the other way round: the .y value is
# preferred and .x is only the fallback when .y is missing.
Timestamp <- ifelse(is.na(Timestamp.y), Timestamp.x, Timestamp.y); Timestamp.x = NULL; Timestamp.y = NULL;
# Degree columns: same .x-first rule as above.
OutDegree <- ifelse(is.na(OutDegree.x), OutDegree.y, OutDegree.x); OutDegree.x = NULL; OutDegree.y = NULL;
InDegree <- ifelse(is.na(InDegree.x), InDegree.y, InDegree.x); InDegree.x = NULL; InDegree.y = NULL;
WOutDegree <- ifelse(is.na(WOutDegree.x), WOutDegree.y, WOutDegree.x); WOutDegree.x = NULL; WOutDegree.y = NULL;
WInDegree <- ifelse(is.na(WInDegree.x), WInDegree.y, WInDegree.x); WInDegree.x = NULL; WInDegree.y = NULL});
print("REPORT: Sources and Targets merged");
# Keep only the first row encountered for each vertex name.
nodes <- nodes[!duplicated(nodes$Vertexname), ]
print("REPORT: Duplicated vertices removed")
nrow(nodes)

# Discard edges with a missing value in either of their first two columns,
# and vertices with a missing value in their first column.
edges <- edges[!is.na(edges[, 1]) & !is.na(edges[, 2]), ]
nodes <- nodes[!is.na(nodes[, 1]), ]
print("REPORT: Invalid edges and nodes removed")

# Build the directed graph from the edge table, using `nodes` as vertex data.
g <- graph.data.frame(d = edges, directed = TRUE, vertices = nodes)
print("REPORT: Graph created")
# Compute out-/in-degree and out-/in-strength for every vertex (the default
# vertex set of these functions is V(g)) and store each result as a vertex
# attribute of `g`.
outdegrees <- degree(g, mode = "out")
g <- set.vertex.attribute(g, "OutDegree", value = outdegrees)

indegrees <- degree(g, mode = "in")
g <- set.vertex.attribute(g, "InDegree", value = indegrees)

# NOTE(review): graph.strength() presumably picks up an edge attribute named
# "weight" when one exists - confirm that `edges` provides the intended one.
woutdegrees <- graph.strength(g, mode = "out")
g <- set.vertex.attribute(g, "WOutDegree", value = woutdegrees)

windegrees <- graph.strength(g, mode = "in")
g <- set.vertex.attribute(g, "WInDegree", value = windegrees)
print("REPORT: Degree calculated and added as vertex attribute")
# Filter
# Derive degree thresholds from the vertex table: the out-degree and in-degree
# found at a fixed rank when the vertices are sorted by that degree in
# descending order.
nodes <- get.data.frame(g, "vertices")

# Rank whose degree value is used as the threshold. The rank 1335 was
# previously used directly as a row index; with fewer vertices that index is
# out of range and yields NA thresholds, which empties the subsets below. The
# rank is therefore clamped to the size of the table (and to at least 1).
rank_cutoff <- max(1L, min(1335L, nrow(nodes)))

nodes <- nodes[order(nodes$OutDegree, decreasing = TRUE), ]
nrow(nodes)
minOutDegree <- nodes[rank_cutoff, "OutDegree"]
minOutDegree

# `nodes` stays sorted by in-degree from here on.
nodes <- nodes[order(nodes$InDegree, decreasing = TRUE), ]
minInDegree <- nodes[rank_cutoff, "InDegree"]
minInDegree

# Vertices reaching at least one threshold ...
nodes2 <- subset(nodes, nodes$OutDegree >= minOutDegree | nodes$InDegree >= minInDegree)
nrow(nodes2)
# ... and vertices reaching both thresholds.
nodes3 <- subset(nodes, nodes$OutDegree >= minOutDegree & nodes$InDegree >= minInDegree)
nrow(nodes3)
# Add an (initially missing) "Group" attribute to every vertex, narrow the
# graph down in three steps and write the result to disk.
# The OutDegree/InDegree vertex attributes are not recomputed between the
# steps in this block, so every step tests the values stored before filtering.
g <- set.vertex.attribute(g, "Group", V(g), NA)

# Step 1: keep vertices that reach at least one of the degree thresholds.
g <- induced.subgraph(g, V(g)$OutDegree >= minOutDegree | V(g)$InDegree >= minInDegree)
ecount(g)
vcount(g)

# Step 2: keep vertices whose stored out- and in-degree are both positive.
g <- induced.subgraph(g, V(g)$OutDegree > 0 & V(g)$InDegree > 0)
ecount(g)
vcount(g)

# Step 3: keep vertices whose stored out-degree exceeds a third of the
# stored in-degree.
g <- induced.subgraph(g, V(g)$OutDegree > (V(g)$InDegree / 3))
ecount(g)
vcount(g)

# The format must be given explicitly: without it write.graph() falls back to
# its default plain edge-list format instead of producing GraphML.
write.graph(g, "SomePath", format = "graphml")
print("REPORT: Subgraph Test saved")
When I import the GraphML file again with read.graph(),
I get the error:
Error in .Call("R_igraph_read_graph_graphml", file, as.numeric(index), :
At foreign-graphml.c:1202 :
å
, Parse error
Therefore I used XMLValidatorBuddy to validate the graphml file (UTF-8 is chosen as used encoding in the dropdown field, but the error occurs no matter which encoding is selected). This is where I got the error:
invalid byte '?' at position 2 of 2-byte sequence
According to XMLValidatorBuddy the error occurs in line 4278.
The answers to this question don't really help me, since I should already have a UTF-8-encoded GraphML file thanks to the conversion in R.