optimise algorithm for building a graph based on node weights

Question

I am trying to improve a function to build a network based on the score calculated from some node attributes. The function tries to find the best subnetwork from a graph maximizing the product of node's attributes.

The function starts in a random node and start searching in the first neighbor, if there are some neighbors whose node's score suffice a threshold, the neighbour/s is added to the first node and the process continues until no more are added (the addition of the neighbour does not produce the desired increment in the score). If there is no node in the first neighbours that yields the increment of the score, then the function looks to the second degree neighbours. In this situation, it is very likely that there are several paths to connect the node (2nd degree neighbour), in this specific case, the chosen path will be the shortest with the highest weight (one of the nodes attribute).

I could do some paralelization of the code, although I don't know how to implement it in this type of function.

The function is the following:

build_network <-
function (G, seed, d= 2){
    net <- G
    d <- d

    score.fun<-function(g){
    Za <- sum(V(g)$weight*V(g)$RWRNodeweight)/sqrt(sum(V(g)$RWRNodeweight^2))
    k <- vcount(g)
    tmp <- genesets.length.null.stat[[as.character(k)]] # genesets.length.null.stat is a list with the median of Za and sd of Za calculated for 1000 replicates of networks of size k
    Sa <- (Za-tmp[1])/tmp[2]
    }

    best.fun<-function(in.nodes,out.nodes)  {
    score<-(-Inf); best<-character()
    for(node in out.nodes){
     subG.update<-induced.subgraph(net, c(in.nodes,node))
     if( score.fun(subG.update) > score ){
        score<-score.fun(subG.update)
        best<-node
        }
    }
    list("node"=best,"score"=score)
    }   

    subG <- induced.subgraph(net, seed)
    if (!is.connected(subG)) {          #the seed must be connected
        stop("Input seeds are disjoint")
    }

    while (TRUE) {
        in.nodes <- V(subG)$name
        node_num <- vcount(subG)
        subsum <- score.fun(subG)
        #subx <- V(subG)$name
        for (rad in 1:d) {
            tmp.neigh <- unlist(neighborhood(net, order = rad, nodes = V(subG)$name)) 
            pot.nodes <- V(net)[tmp.neigh]$name
            out.nodes <- setdiff(pot.nodes, in.nodes)
            if (length(out.nodes) == 0) break

            best_node<-best.fun(in.nodes, out.nodes) 
            new_score<-best_node$score
            best_node<-best_node$node 

            if (new_score > subsum + 0.01) {
                tmp <- unlist(lapply(best_node, function(x) node2treePath(net,V(subG)$name, x))) # node2treePath is a function to retrieve the shortest path with the highest node weights
                in.nodes <- c(tmp, V(subG)$name)
                subG <- induced.subgraph(net, in.nodes)
                break
            }
        }
        if (node_num == vcount(subG)) break
    }
    return(subG)
}

I am trying to apply this function to a graph of ~10,000 nodes. Here will be an approximation of the code for running the function

### generate some example data
library(igraph)
my_graph <- erdos.renyi.game(10000, 0.0003)
V(my_graph)$name <- 1:vcount(my_graph)
V(my_graph)$weight <- rnorm(10000)
V(my_graph)$RWRNodeweight <- runif(10000, min=0, max=0.05)

### Run the function
sublist = list()
for (node in V(G)$name) {
    subnet <- build_network(G, node, d)
    sublist[[node]] <- subnet }

EDIT: here is the dput of head(genesets.length.null.stat)

structure(list(`1` = c(1.01397367504035, 1.18858228819048), `2` = c(1.61970348041337, 1.30189433386605), `3` = c(2.11767222957028, 1.36222065695878), `4` = c(2.47710421934929, 1.36968129959296), `5` = c(2.776011866622, 1.36318885187196), `6` = c(3.16885126246671, 1.42577861995897)), .Names = c("1", "2", "3", "4", "5", "6"))

Here is the node2treePath function:

node2treePath <- function (G, Tnodes, node){

tmp.path <- get.all.shortest.paths(G, node, Tnodes)$res
tmp.l <- unlist(lapply(tmp.path, length))
index <- which(tmp.l == min(tmp.l))

tmp.path = tmp.path[index]
tmp.sum <- unlist(lapply(tmp.path, function(x)return(sum(V(G)[x]$weight))))
index <- which(tmp.sum == max(tmp.sum))

selected.path = tmp.path[index]
collect <- unlist(lapply(selected.path, function(x)return(V(G)[x]$name)))

return(collect)
}

can you make the example reproducible, e.g. `genesets.length.null.stat` is non-existent. — Jack Wasey, Oct 25 '15 at 21:15
`genesets.length.null.stat` is a list of 500 elements, which stores in each element two values (mean, and standard deviation) — user2380782, Oct 26 '15 at 11:29
@user2380782 could you provide an example of `genesets.length.null.stat`, e.g. `dput(head(genesets.length.null.stat))`, or perhaps some code that generates random `genesets.length.null.stat` objects? — josliber, Oct 27 '15 at 20:01
@josilber, actually the `genesets.length.null.stat` is just a list after running your `josilber.rcpp` function from my previous post http://stackoverflow.com/questions/33084860/sampling-subgraphs-from-different-sizes-using-igraph, e.g `genesets.length.null.stat <- lapply(1:500, function(x)josilber.rcpp(x,1000,my_graph)` and then calculate the mean and sd for each component of the list. I have added he `dput` as you suggested. Many thanks and let me know if you need anything else. — user2380782, Oct 28 '15 at 15:06
you don't define `node2treePath`. Also, `best_fun` should be `best.fun`. Even with a bounty question, you still need to write reproducible examples so people can actually run code to fix the specific problem you state. You can provide an equivalent dummy function, or the real thing. — Jack Wasey, Oct 29 '15 at 09:57
@JackWasey, I have edited the post adding the `node2treePath` function, the code works but clearly is not the fastest, thanks for your help and sorry for not posting in the better way — user2380782, Oct 29 '15 at 13:06

score 0 · Answer 1 · answered Oct 22 '15 at 16:29

Since your score function only depends on node attributes and not edge's, the solution is not unique; you might want to search for a best tree instead. If you restructure your problem so that your nodes are edges and vice-versa, you probably can just use eg Djikstra's algorithm to find the best one. That is already in the igraph package as shortest.paths().

score 0 · Answer 2 · answered Oct 29 '15 at 14:20

I can't read the R code, but based on your description: If the score threshold is constant, then this is easy to do in O(|V|+|E|+|C|^2) time, where |C| is the number of "good" components (this will be further explained shortly).

In a first pass, delete all nodes with score below the threshold. Then find all connected components in this new graph (this can be done in O(|V|+|E|) time by starting a DFS at each as-yet-unvisited node), calculate their scores by multiplying together all vertex weights in the component, and label each vertex with its component ID. This already tells you the "good" components -- the ones that don't require any 2nd-degree connections.

Suppose this produces |C| components. Create an empty hashtable H which has component-ID pairs for keys, and (length, weight) pairs for values. Now go back through each vertex v you deleted in the first pass: for each one, look at all its neighbours and record the shortest edge to each distinct component (this can be done using a length-|C| array to store the shortest edge to each component seen so far). After examining all of v's neighbours, count the number k of distinct components they fall into: if k >= 2, then v potentially should be used to connect some of these k(k-1)/2 pairs of components. For every pair of distinct components i and j that could be connected by v, update H with the weight and distance of this 2-edge connection as necessary: that is, if i and j are not yet joined together, then record that v joins them; otherwise, if they are already joined by some vertex u, only update H if v can do better (i.e., if it uses less total length and greater weight than u would). This step can be thought of as building a minimum spanning tree in a "component graph" derived from the original, pruned graph. The scores for each new "combined" component can easily be calculated as you go just by multiplying together the scores of the two constituent components.

Finally, simply return the component whose product is maximum.

Thanks for the ideas. I will look them in detail and try to implement the code, but actually never thought about using a hash — user2380782, Oct 29 '15 at 16:53

Jack Wasey · Accepted Answer · 2015-10-30T09:51:53.523

For the logic you want to do (and I imagine you may wish to change in way incompatible with the above answers) the following code is about ~~ten times~~ 30% faster. I used Rprof and profr and recoded some slow bits in trivial ways, e.g. not passing a named list pair, just an anonymous pair from one of your functions. The numerically named list with pairs of values for genesets.length.null.stat is very inefficient. I replaced it with two numeric vectors. You also call the 'V' function a lot, which was a big time consumer: as you can see, you can call it once, then query the result as needed.

# node2treePath is a function to retrieve the shortest path with the highest node weights
node2treePath_jw <- function(G, Tnodes, node){

  tmp.path <- get.all.shortest.paths(G, node, Tnodes)$res
  tmp.l <- vapply(tmp.path, length, integer(1))
  index <- which(tmp.l == min(tmp.l))

  tmp.path = tmp.path[index]
  Vg <- V(G)
  tmp.sum <- vapply(tmp.path, function(x) sum(Vg[x]$weight), numeric(1))
  index <- which(tmp.sum == max(tmp.sum))

  selected.path = tmp.path[index]
  sapply(selected.path, function(x) Vg[x]$name)
}

build_network_jw <- function(net, seed, d= 2){

  score.fun <- function(Vg, k){
    Za <- sum(Vg$weight * Vg$RWRNodeweight) / sqrt(sum(Vg$RWRNodeweight^2))
    (Za - genesets_jack_a[k]) / genesets_jack_b[k]
  }

  best.fun_jw <- function(in.nodes, out.nodes)  {
    score <- (-Inf)
    best <- character()
    for (node in out.nodes) {
      subG.update <- induced.subgraph(net, c(in.nodes,node))
      Vsgu <- V(subG.update)
      Vsgu_count <- vcount(subG.update)
      sf <- score.fun(Vsgu, Vsgu_count)
      if (sf > score) {
        score <- sf
        best <- node
      }
    }
    list(best, score)
  }

  subG <- induced.subgraph(net, seed)
  if (!is.connected(subG)) {          #the seed must be connected
    stop("Input seeds are disjoint")
  }

  while (TRUE) {
    VsubG <- V(subG)
    Vnet <- V(net)
    in.nodes <- VsubG$name
    node_num <- vcount(subG)
    subsum <- score.fun(VsubG, node_num)

    for (rad in 1:d) { # d = 2
      tmp.neigh <- unlist(neighborhood(net, order = rad, nodes = VsubG$name))
      pot.nodes <- Vnet[tmp.neigh]$name
      out.nodes <- setdiff(pot.nodes, in.nodes)
      if (length(out.nodes) == 0) break

      best_node <- best.fun_jw(in.nodes, out.nodes)
      new_score <- best_node[[2]]
      best_node <- best_node[[1]]

      if (new_score > subsum + 0.01) {
        tmp <- sapply(best_node, function(x) node2treePath_jw(net, VsubG$name, x))
        in.nodes <- c(tmp, VsubG$name)
        subG <- induced.subgraph(net, in.nodes)
        break
      }
    }
    if (node_num == vcount(subG)) break
  }
  subG
}

node2treePath <- function (G, Tnodes, node){

  tmp.path <- get.all.shortest.paths(G, node, Tnodes)$res
  tmp.l <- unlist(lapply(tmp.path, length))
  index <- which(tmp.l == min(tmp.l))

  tmp.path = tmp.path[index]
  tmp.sum <- unlist(lapply(tmp.path, function(x)return(sum(V(G)[x]$weight))))
  index <- which(tmp.sum == max(tmp.sum))

  selected.path = tmp.path[index]
  collect <- unlist(lapply(selected.path, function(x)return(V(G)[x]$name)))

  return(collect)
}


build_network <- function (net, seed, d= 2){

  #genesets.length.null.stat <- structure(list(`1` = c(1.01397367504035, 1.18858228819048), `2` = c(1.61970348041337, 1.30189433386605), `3` = c(2.11767222957028, 1.36222065695878), `4` = c(2.47710421934929, 1.36968129959296), `5` = c(2.776011866622, 1.36318885187196), `6` = c(3.16885126246671, 1.42577861995897)), .Names = c("1", "2", "3", "4", "5", "6"))
  genesets.length.null.stat <- lapply(1:500, function(x) c(runif(1)+x, runif(1)+x))
  names(genesets.length.null.stat) <- 1:500

  score.fun<-function(g){
    Za <- sum(V(g)$weight*V(g)$RWRNodeweight)/sqrt(sum(V(g)$RWRNodeweight^2))
    k <- vcount(g)
    tmp <- genesets.length.null.stat[[as.character(k)]] # genesets.length.null.stat is a list with the median of Za and sd of Za calculated for 1000 replicates of networks of size k
    Sa <- (Za-tmp[1])/tmp[2]
  }

  best.fun <- function(in.nodes,out.nodes)  {
    score<-(-Inf); best<-character()
    for (node in out.nodes){
      subG.update<-induced.subgraph(net, c(in.nodes,node))
      if (score.fun(subG.update) > score) {
        score<-score.fun(subG.update)
        best<-node
      }
    }
    list("node"=best,"score"=score)
  }

  subG <- induced.subgraph(net, seed)
  if (!is.connected(subG)) {          #the seed must be connected
    stop("Input seeds are disjoint")
  }

  while (TRUE) {
    in.nodes <- V(subG)$name
    node_num <- vcount(subG)
    subsum <- score.fun(subG)
    #subx <- V(subG)$name
    for (rad in 1:d) {
      tmp.neigh <- unlist(neighborhood(net, order = rad, nodes = V(subG)$name))
      pot.nodes <- V(net)[tmp.neigh]$name
      out.nodes <- setdiff(pot.nodes, in.nodes)
      if (length(out.nodes) == 0) break

      #message("length in.nodes = ", length(in.nodes))
      #message("length out.nodes = ", length(out.nodes))

      best_node<-best.fun(in.nodes, out.nodes)
      new_score<-best_node$score
      best_node<-best_node$node

      if (new_score > subsum + 0.01) {
        tmp <- unlist(lapply(best_node, function(x) node2treePath(net,V(subG)$name, x))) # node2treePath is a function to retrieve the shortest path with the highest node weights
        in.nodes <- c(tmp, V(subG)$name)
        subG <- induced.subgraph(net, in.nodes)
        break
      }
    }
    if (node_num == vcount(subG)) break
  }
  subG
}

library(igraph)
library(profr)



library(igraph)
library(profr)

#genesets.length.null.stat <- lapply(1:500, function(x) c(runif(1)+x, runif(1)+x))
#names(genesets.length.null.stat) <- 1:500

set.seed(1)
genesets_jack_a = runif(500) + 1:500
genesets_jack_b = runif(500) + 1:500

do_it_jw <- function(n = 1000){

  my_graph <- erdos.renyi.game(n, 0.0003)
  V(my_graph)$name <- 1:vcount(my_graph)
  V(my_graph)$weight <- rnorm(n)
  V(my_graph)$RWRNodeweight <- runif(n, min = 0, max = 0.05)

  ### Run the function
  sublist = list()
  Vmg <- V(my_graph)
  for (node in Vmg$name) {
    #message(node)
    subnet <- build_network_jw(my_graph, node, 2)
    sublist[[node]] <- subnet }
}

do_it <- function(n = 1000){

  my_graph <- erdos.renyi.game(n, 0.0003)
  V(my_graph)$name <- 1:vcount(my_graph)
  V(my_graph)$weight <- rnorm(n)
  V(my_graph)$RWRNodeweight <- runif(n, min = 0, max = 0.05)

  ### Run the function
  sublist = list()
  Vmg <- V(my_graph)
  for (node in Vmg$name) {
    #message(node)
    subnet <- build_network(my_graph, node, 2)
    sublist[[node]] <- subnet }
}

library(microbenchmark)
mb <- microbenchmark(do_it(1000), do_it_jw(1000), times = 5)
print(mb)

optimise algorithm for building a graph based on node weights

3 Answers3