1

I am a newbie in the field of GNNs and want to use PyTorch Geometric (PyG) to train a Graph Neural Network (GNN) to predict links (edges) between nodes in a graph using an autoencoder (a modified version of the PyG link prediction example with two SAGEConv layers; I used this tutorial). I would like to add a new node to the graph and predict which existing nodes have the highest probability of having an edge with the new node. Specifically, I have the following questions:

  1. How can I add a new node (feature tensor) that has no edges to an existing graph?
  2. How can I predict the nodes with the highest probability of having an edge with the new node?

The reason I chose SAGEConv layers is that (if I understood correctly) the inductive capabilities of GraphSAGE allow me to predict links to nodes that were not present during training.

So far I have defined and trained the following model and training function:

class Net(torch.nn.Module):
    """Two-layer SAGEConv encoder with a dot-product link decoder.

    ``encode`` maps node features to embeddings, ``decode`` scores a given set
    of node pairs, and ``decode_all`` lists every node pair whose score is
    positive.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output size of the first SAGEConv layer.
        out_channels: Output size of the second SAGEConv layer (embedding size).
        dropout: Dropout probability applied after each layer while training.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.1):
        super().__init__()
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return node embeddings; delegates to :meth:`encode`.

        Defined so that ``model(x, edge_index)`` and ``model.forward(...)``
        work instead of raising ``NotImplementedError``.
        """
        return self.encode(x, edge_index)

    def encode(self, x, edge_index):
        """Compute node embeddings from features ``x`` and ``edge_index``.

        Dropout is only active while the module is in training mode.
        """
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # F.dropout defaults to training=True, so the module's mode must be
        # passed explicitly or dropout would also fire under model.eval().
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, edge_index)
        # NOTE(review): ReLU on the last layer makes every embedding component
        # non-negative, so every dot product in decode() is >= 0. Consider
        # returning conv2's output directly if negative logits are needed.
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        return x

    def decode(self, z, edge_label_index):
        """Return one score per column of ``edge_label_index``.

        The score is the dot product of the two endpoint embeddings in ``z``.
        """
        src = z[edge_label_index[0]]
        dst = z[edge_label_index[1]]
        return (src * dst).sum(dim=-1)

    def decode_all(self, z):
        """Return, as a 2 x K index tensor, all pairs (i, j) with z_i . z_j > 0."""
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

def train_link_predictor(
    model, train_data, val_data, optimizer, criterion, n_epochs=100
):
    """Train ``model`` for link prediction and return it.

    Each epoch encodes the training graph, draws fresh negative edges, scores
    positive and negative pairs with ``model.decode``, and takes one optimizer
    step on ``criterion(scores, labels)``. After every step the validation AUC
    is computed and both metrics are logged.

    Relies on ``negative_sampling``, ``eval_link_predictor`` and ``writer``
    being available at module scope.

    Args:
        model: Object exposing ``encode(x, edge_index)`` and
            ``decode(z, edge_label_index)``.
        train_data: Training graph providing ``x``, ``edge_index``,
            ``edge_label``, ``edge_label_index`` and ``num_nodes``.
        val_data: Validation graph passed to ``eval_link_predictor``.
        optimizer: Optimizer over the model parameters.
        criterion: Loss called as ``criterion(out, edge_label)``.
        n_epochs: Number of training epochs.

    Returns:
        The same ``model`` instance after training.
    """
    for epoch in range(1, n_epochs + 1):
        model.train()
        optimizer.zero_grad()

        # Node embeddings for the training graph. encode() is called directly
        # because it is the encoder entry point the model is known to define.
        z = model.encode(train_data.x, train_data.edge_index)

        # Resample negatives every epoch, as many as there are labelled edges.
        neg_edge_index = negative_sampling(
            edge_index=train_data.edge_index,
            num_nodes=train_data.num_nodes,
            num_neg_samples=train_data.edge_label_index.size(1),
            method='sparse',
        )

        edge_label_index = torch.cat(
            [train_data.edge_label_index, neg_edge_index],
            dim=-1,
        )

        # Sampled negatives get label 0; existing labels are kept as they are.
        edge_label = torch.cat(
            [
                train_data.edge_label,
                train_data.edge_label.new_zeros(neg_edge_index.size(1)),
            ],
            dim=0,
        )

        # One score per candidate edge (dot product of endpoint embeddings).
        out = model.decode(z, edge_label_index).view(-1)

        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()

        # Log a plain float so the autograd graph is not kept alive by logging.
        loss_value = loss.item()
        val_auc = eval_link_predictor(model, val_data)
        writer.add_scalar("Loss/train", loss_value, epoch)
        # This is the validation metric, so it is tagged as such.
        writer.add_scalar("AUC/val", val_auc, epoch)
        if epoch % 10 == 0:
            print(
                f"Epoch: {epoch:03d}, Train Loss: {loss_value:.3f}, "
                f"Val AUC: {val_auc:.3f}"
            )

    return model
    

I thought about appending the new node's feature tensor to the list of nodes (data.x) and adding an entry with no edges to the adjacency matrix (data.edge_index), but I have no idea whether this is the best, or even a useful, solution.

feinheitsbrei
  • 51
  • 2
  • 5

0 Answers0