-3

I have to export a GraphML file from R igraph so that I can add column values by hand. When I import the GraphML file again, it has to be proper UTF-8 and valid XML. So, before saving, I convert the data to UTF-8 using iconv(), as you can see in the for loop of my code below:

   library(igraph)
# Load the semicolon-separated edge list. quote = "" switches quote handling
# off, so quote characters inside a field are read as ordinary text.
edges <- read.csv2(
  "https://www.dropbox.com/s/p8e7hcck0d4nnrp/Subgraph_nowvalid.graphml?dl=0",
  header = TRUE,
  quote = ""
)
amount <- nrow(edges)
amount

# Pre-allocate the vertex attribute table with one row per edge. Both the
# source-side and the target-side table start from this same empty layout.
sources <- data.frame(
  Vertexname = character(amount),
  Description = character(amount),
  Follower = numeric(amount),
  Friends = numeric(amount),
  Favourites = numeric(amount),
  Statuses = numeric(amount),
  ProfileAge = numeric(amount),
  Listed = numeric(amount),
  Timestamp = numeric(amount),
  OutDegree = numeric(amount),
  InDegree = numeric(amount),
  WOutDegree = numeric(amount),
  WInDegree = numeric(amount)
)
targets <- sources

# Sanitise the text columns of `edges` so that the exported file is valid
# UTF-8 XML.
#
# Only character/factor columns are processed: iconv() coerces any other input
# with as.character(), so applying it to every column would silently turn
# numeric columns into strings.
is_text <- vapply(
  edges,
  function(column) is.character(column) || is.factor(column),
  logical(1)
)
# which() yields an empty index set when there is no text column (or no
# column at all), so the loop is skipped instead of iterating over c(1, 0).
for (i in which(is_text)) {
  # Re-encode to UTF-8 first; sub = "" drops bytes that cannot be converted.
  cleaned <- iconv(as.character(edges[, i]), to = "UTF-8", sub = "")
  # Then strip control characters, most of which are not legal in XML 1.0.
  edges[, i] <- gsub("[[:cntrl:]]", "", cleaned)
}

# Profile attributes, in the order they appear as columns 2-8 of the vertex
# tables.
profile_attrs <- c(
  "Description", "Follower", "Friends", "Favourites",
  "Statuses", "ProfileAge", "Listed"
)

# Source side: only the vertex name (edge column 1) and the timestamp (edge
# column 4) are taken from the edge list; every other attribute is set to NA.
sources[, "Vertexname"] <- edges[, 1]
sources[, profile_attrs] <- NA
sources[, "Timestamp"] <- edges[, 4]
sources[, c("OutDegree", "InDegree", "WOutDegree", "WInDegree")] <- NA

# Target side: the vertex name is edge column 2 and the seven profile
# attributes are copied from edge columns 7-13, in order. The rest is NA.
targets[, "Vertexname"] <- edges[, 2]
for (offset in seq_along(profile_attrs)) {
  targets[, profile_attrs[offset]] <- edges[, 6 + offset]
}
targets[, c("Timestamp", "OutDegree", "InDegree", "WOutDegree", "WInDegree")] <- NA

print("REPORT: vertices data frames filled")

# Keep only the first occurrence of each fully identical row.
sources <- sources[!duplicated(sources), , drop = FALSE]
targets <- targets[!duplicated(targets), , drop = FALSE]
print("REPORT: Duplicated sources and targets removed")

# Combine the source-side and target-side records into one table keyed by
# vertex name. merge() suffixes the clashing attribute columns with .x (from
# `sources`) and .y (from `targets`). Each pair is collapsed into a single
# column that takes the .x value unless it is NA (Timestamp prefers .y), and
# the suffixed columns are then removed by assigning NULL.
nodes <- within(merge(sources, targets, by = "Vertexname", all = TRUE), {
  # as.character() keeps a missing description as NA; paste() would turn it
  # into the literal string "NA".
  Description <- ifelse(is.na(Description.x), as.character(Description.y), Description.x)
  Description.x <- NULL
  Description.y <- NULL

  Follower <- ifelse(is.na(Follower.x), Follower.y, Follower.x)
  Follower.x <- NULL
  Follower.y <- NULL

  Friends <- ifelse(is.na(Friends.x), Friends.y, Friends.x)
  Friends.x <- NULL
  Friends.y <- NULL

  Favourites <- ifelse(is.na(Favourites.x), Favourites.y, Favourites.x)
  Favourites.x <- NULL
  Favourites.y <- NULL

  Statuses <- ifelse(is.na(Statuses.x), Statuses.y, Statuses.x)
  Statuses.x <- NULL
  Statuses.y <- NULL

  ProfileAge <- ifelse(is.na(ProfileAge.x), ProfileAge.y, ProfileAge.x)
  ProfileAge.x <- NULL
  ProfileAge.y <- NULL

  Listed <- ifelse(is.na(Listed.x), Listed.y, Listed.x)
  Listed.x <- NULL
  Listed.y <- NULL

  # Unlike the other attributes, the target-side value wins here.
  Timestamp <- ifelse(is.na(Timestamp.y), Timestamp.x, Timestamp.y)
  Timestamp.x <- NULL
  Timestamp.y <- NULL

  OutDegree <- ifelse(is.na(OutDegree.x), OutDegree.y, OutDegree.x)
  OutDegree.x <- NULL
  OutDegree.y <- NULL

  InDegree <- ifelse(is.na(InDegree.x), InDegree.y, InDegree.x)
  InDegree.x <- NULL
  InDegree.y <- NULL

  WOutDegree <- ifelse(is.na(WOutDegree.x), WOutDegree.y, WOutDegree.x)
  WOutDegree.x <- NULL
  WOutDegree.y <- NULL

  WInDegree <- ifelse(is.na(WInDegree.x), WInDegree.y, WInDegree.x)
  WInDegree.x <- NULL
  WInDegree.y <- NULL
})
print("REPORT: Sources and Targets merged");

# Keep only the first row for every vertex name.
nodes <- nodes[!duplicated(nodes$Vertexname), , drop = FALSE]
print("REPORT: Duplicated vertices removed")

nrow(nodes)

# Drop edges with a missing endpoint (columns 1 and 2) and vertices whose
# name (column 1) is missing.
has_both_ends <- !is.na(edges[, 1]) & !is.na(edges[, 2])
edges <- edges[has_both_ends, ]
nodes <- nodes[!is.na(nodes[, 1]), ]
print("REPORT: Invalid edges and nodes removed")

# Build the directed graph from the edge list, attaching the vertex table.
g <- graph.data.frame(edges, directed = TRUE, vertices = nodes)
print("REPORT: Graph created")

# Plain and weighted (strength) degree for every vertex, in both directions.
outdegrees <- degree(g, mode = "out")
indegrees <- degree(g, mode = "in")
woutdegrees <- graph.strength(g, mode = "out")
windegrees <- graph.strength(g, mode = "in")

# Store the results as vertex attributes.
V(g)$OutDegree <- outdegrees
V(g)$InDegree <- indegrees
V(g)$WOutDegree <- woutdegrees
V(g)$WInDegree <- windegrees
print("REPORT: Degree calculated and added as vertex attribute")

# Filter

# Rank (1-based, in descending degree order) of the vertex whose degree is
# used as the cut-off.
top_n <- 1335

nodes <- get.data.frame(g, "vertices")
# Clamp the rank to the number of vertices: indexing past the last row would
# return NA and make every later threshold comparison NA as well.
cutoff_rank <- min(top_n, nrow(nodes))

nodes <- nodes[order(nodes$OutDegree, decreasing = TRUE), ]
nrow(nodes)
minOutDegree <- nodes[cutoff_rank, "OutDegree"]
minOutDegree
nodes <- nodes[order(nodes$InDegree, decreasing = TRUE), ]
minInDegree <- nodes[cutoff_rank, "InDegree"]
minInDegree

# Vertices reaching at least one threshold (nodes2) or both (nodes3); the row
# counts are printed for inspection.
nodes2 <- subset(nodes, nodes$OutDegree >= minOutDegree | nodes$InDegree >= minInDegree)
nrow(nodes2)
nodes3 <- subset(nodes, nodes$OutDegree >= minOutDegree & nodes$InDegree >= minInDegree)
nrow(nodes3)

# Add an empty "Group" vertex attribute (all NA).
g <- set.vertex.attribute(g, "Group", V(g), NA)

# Narrow the graph in three steps, printing edge and vertex counts after each.
# NOTE(review): the OutDegree/InDegree attributes are not recomputed between
# steps here, so every filter uses the stored values - confirm this is intended.

# 1. Keep vertices that reach at least one of the degree thresholds.
g <- induced.subgraph(g, V(g)$OutDegree >= minOutDegree | V(g)$InDegree >= minInDegree)
length(E(g))
length(V(g))

# 2. Keep vertices with both a positive out-degree and a positive in-degree.
g <- induced.subgraph(g, V(g)$OutDegree > 0 & V(g)$InDegree > 0)
length(E(g))
length(V(g))

# 3. Keep vertices whose out-degree exceeds a third of their in-degree.
g <- induced.subgraph(g, V(g)$OutDegree > (V(g)$InDegree / 3))
length(E(g))
length(V(g))

# The format must be given explicitly: write.graph() defaults to "edgelist",
# which would not produce a GraphML file.
write.graph(g, "SomePath", format = "graphml")
print("REPORT: Subgraph Test saved")

When I import the graphml file with read.graph again I get the error:

Error in .Call("R_igraph_read_graph_graphml", file, as.numeric(index),  : 
  At foreign-graphml.c:1202 :  
å
, Parse error

Therefore I used XMLValidatorBuddy to validate the graphml file (UTF-8 is chosen as used encoding in the dropdown field, but the error occurs no matter which encoding is selected). This is where I got the error:

invalid byte '?' at position 2 of 2-byte sequence

According to XMLValidatorBuddy the error occurs in line 4278.

The answers to this question don't really help me, since I should already have a UTF-8 encoded graphml file due to the conversion in R.

Community
  • 1
  • 1
  • 1
    Can you make a *minimal* example that reproduces the problem? This code dump of yours depends on a file we don't have and then does a hundred things, any one of which might be the problem. And the only link is to a 6Mb XML file, which I can't be bothered downloading. Can you make a tiny graph and generate a similarly broken graphml file? Surely it would only take a few lines, a dozen at most. – Spacedman Dec 29 '14 at 18:01
  • I tried to narrow down the graph so that only the edges are left that are related to the node that maybe causes the problem because line 4278 contains data of that node. But I got no error. I don't know how to reproduce the error without knowing the source... – herrschedel Dec 29 '14 at 19:29
  • Have you deleted the linked file? – Spacedman Dec 30 '14 at 10:14
  • Oh sorry, I forgot to update the hyperlink when I changed the file. The link behind "graphml file" in the first line now points to the same file as read.graph in the second code line. Problem got solved in the meantime – herrschedel Dec 31 '14 at 13:17

1 Answer

0

This line is definitely incorrect:

edges[,i] <- gsub("[[:ctrl:]]", "", edges[,i])

I see that its purpose would be to strip away any disallowed control characters from the edge attributes so you don't get any trouble further on with the GraphML writer, but [[:ctrl:]] should be [[:cntrl:]]. (Actually, my R version complains when it sees [[:ctrl:]], but maybe yours doesn't).

Also, I would avoid messing with individual characters of the string after having converted it to UTF-8. If you want to strip control characters from the string, do it before the conversion to UTF-8. Due to how UTF-8 encoding works, Unicode characters with a character code less than 128 (which contains all the control characters that you are worrying about) are left intact, and the UTF-8 encoding will not introduce any additional ASCII characters with a character code less than 128.

Tamás
  • 47,239
  • 12
  • 105
  • 124
  • thank you for the note! Obviously a typo that R unfortunately did not see. I also switched the code lines so that stripping away comes before iconv. – herrschedel Dec 29 '14 at 19:11
  • It works! The code above now creates a graphml file that can be imported again immediately without any errors. I relocated the code inside the for loop so that it first converts all columns to UTF-8 and then strips away the control characters of the character vector. Thank you very much for your help! – herrschedel Dec 29 '14 at 20:25