Introduction

This notebook contains the code needed to reproduce the findings published in the journal article. For more background and details, please see the GitHub repository; its description also contains a link to the journal article.

Load the data

First, we load the potential biomarker compounds as well as the reference set.

Potential biomarkers

# Required packages: fromJSON() is assumed to come from jsonlite; ggplot2 is
# used for the figures further below
library(jsonlite)
library(ggplot2)

# Load the subgraph of concepts directly connected to migraine
subgraph = read.csv2("Adjacent connections -- Transmog 31-10-2016.csv", header = F, stringsAsFactors = F)
colnames(subgraph) = c("Name", "ID", "PredicateID", "SemanticTypeIDs", "nSources", "Sources")

# Load the potential biomarker compounds
candidates = read.csv2("Indirect_Transmog_all 31-10-2016.csv", stringsAsFactors = F)

# Consistency check: keep only candidates whose intermediate concept
# actually occurs in the subgraph
candidates = candidates[candidates$Connected_to_ID %in% subgraph$ID, ]
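
After this step, every intermediate concept in the candidates table should also occur in the subgraph. A quick assertion (added here for illustration; it is not part of the original analysis) makes that invariant explicit:

# Sanity check: all intermediate concepts must now be present in the subgraph
stopifnot(all(candidates$Connected_to_ID %in% subgraph$ID))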

Reference set

selected = read.csv("Definite_CSF_compounds_migraine_node_IDs_16-01-2015.csv", stringsAsFactors = F, header = T, sep = ";")

selected_serum = read.csv("Definite_serum_migraine_node_ID_16-01-2015_unique.csv", stringsAsFactors = F, header = F, sep = ";")
colnames(selected_serum) = c("Compound", "UMLS", "ID", "Lit_count", "pref_scheme", "all_scheme")

# Combine the CSF and serum reference sets with an outer merge
all = merge(selected, selected_serum, by = "ID", all = T)

# Compounds present in only one list get NA after the merge; set those to 0
# so that the combined score is defined for every compound
all[is.na(all)] = 0
all$total = all$Selected + all$Lit_count
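
As a minimal illustration of why the NA replacement matters (toy data, not taken from the actual files):

# Toy example: an outer merge leaves NAs for compounds present in only one
# list; replacing them with 0 keeps the summed score defined
x = data.frame(ID = c(1, 2), Selected = c(3, 1))
y = data.frame(ID = c(2, 3), Lit_count = c(2, 4))
m = merge(x, y, by = "ID", all = TRUE)
m[is.na(m)] = 0
m$total = m$Selected + m$Lit_count   # compound 3 gets total 4 instead of NA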

Apply the filters

Next, we filter the data to remove highly generic concepts as well as pharmacological preparations. For this and the next steps, we use the concept identifiers instead of the concept names. The list of generic concepts is shown below. The list of exogenous drugs is too long to display here and is therefore loaded from a separate file.

General = '{
"Disease" : 14188,
"Chronic Disease" : 10188,
"Rare Diseases" : 908408,
"Body Tissue" : 43826,
"Genes" : 19190,
"Molecular targets" : 2242208,
"Related compounds" : 825890,
"Receptor" : 820372,
"Enzymes" : 16098,
"Vitamins" : 46376,
"Toxin" : 44094,
"Amino acids" : 3622,
"Hormones" : 21946,
"Antigens" : 4550,
"Proteins" : 36588,
"Cytokines" : 103770,
"Pharmaceutical Preparations" : 14706,
"Primary Disease" : 308818,
"Syndrome" : 42644,
"Ions" : 24216,
"Elements" : 15436,
"Steroids" : 41756, 
"Cells" : 8996, 
"DNA" : 14362}'

# Convert the JSON into a data frame of concept names and IDs
general_concepts = t(as.data.frame(fromJSON(General)))
filtergroup = data.frame(Name = rownames(general_concepts), ID = general_concepts[,1])

# Remove these concepts from the subgraph, from the intermediate concepts in
# the candidates data frame, and from the potential biomarkers. Logical
# indexing with %in% is used instead of -which(), which would drop all rows
# if there were no matches.
subgraph = subgraph[!(subgraph$ID %in% filtergroup$ID),]
candidates = candidates[!(candidates$Connected_to_ID %in% filtergroup$ID),]
candidates = candidates[!(candidates$ID %in% filtergroup$ID),]

Now we remove the drugs using a pre-compiled list (the code that creates this list is available upon request).

# Load the drug list and remove those concepts from the potential biomarkers,
# the intermediate concepts in the candidates data frame, and the subgraph
filterstuff = read.csv2("pharma_not_wanted_2.csv", header = F)
candidates = candidates[!(candidates$ID %in% filterstuff$V1), ]
candidates = candidates[!(candidates$Connected_to_ID %in% filterstuff$V1), ]
subgraph = subgraph[!(subgraph$ID %in% filterstuff$V1), ]

Calculate the ranking statistics

The code below calculates the three ranking statistics (the edge chance, the relationship chance, and the source chance), first for each step of the path and then combined per candidate compound.
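
In symbols (notation added here for clarity; it does not appear in the code): for a candidate compound $C$ reached from migraine $A$ via intermediate concepts $B$, each cumulative statistic has the form

$$P(A \to C) = \sum_{B} P(A \to B)\, P(B \to C).$$

For the relationship chance, $P(A \to B)$ is the fraction of all subgraph relationships involving $B$, and $P(B \to C)$ the fraction of $B$'s candidate relationships pointing to $C$; the source chance replaces these relationship counts with source counts, and the edge chance uses a uniform probability over the available connections.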

# Unique concepts: 's' holds the subgraph (A-B) concepts, 'c' the candidate
# (B-C) compounds; note that 'c' masks the name of base::c, but function
# calls like c(...) still resolve to the base function
s = unique(subgraph[ , c("Name", "ID", "SemanticTypeIDs")])

c = unique(candidates[, c("Name", "ID", "SemanticTypeIDs")])
intermediate_table = unique(candidates[, c("Connected_to_ID", "Name", "ID", "SemanticTypeIDs")])

# Number of distinct intermediate concepts through which each candidate is reached
nConcepts = aggregate(intermediate_table$Connected_to_ID, by = list(ID = intermediate_table$ID), FUN = length)
colnames(nConcepts) = c("ID", "nConcepts")

## Calculate the ranking statistics
# Subgraph (A-B) level: totals over all relationships and sources
nSources_total = sum(subgraph$nSources)
nRelationships_total = nrow(subgraph)

s$Edge_Chance_AB = 1/nrow(s)   # uniform chance per intermediate concept
s$Rel_Chance_AB = 0
s$Source_Chance_AB = 0

# Fraction of all subgraph relationships (and sources) accounted for by each concept
for(i in 1:nrow(s)){
  subgroup = subgraph[which(subgraph$ID == s$ID[i]), ]
  s$Rel_Chance_AB[i] = nrow(subgroup) / nRelationships_total
  s$Source_Chance_AB[i] = sum(subgroup$nSources) / nSources_total
}

# For each intermediate concept B, calculate the B-C level statistics
ConIDs = unique(candidates$Connected_to_ID)
tempTab = data.frame()
for(i in 1:length(ConIDs)){
  subgroup = candidates[which(candidates$Connected_to_ID == ConIDs[i]),]
  IdFreq = as.data.frame(table(ID = subgroup$ID))        # relationship counts per candidate
  IdFreq$RelChance_BC = IdFreq$Freq / sum(IdFreq$Freq)   # fraction of B's relationships
  IdFreq$EdgeChance_BC = 1/nrow(subgroup)                # uniform chance per edge
  IdSourceSums = aggregate(subgroup$nSources, by=list(ID = subgroup$ID), FUN=sum)
  IdSourceSums$SourceChance_BC = IdSourceSums$x / sum(subgroup$nSources)   # source-weighted fraction
  IdFreq = merge(IdFreq[,-2], IdSourceSums[,-2], by = "ID")   # drop the raw counts before merging
  IdFreq$Connected_to_ID = ConIDs[i]
  tempTab = rbind(tempTab, IdFreq)
}

# Merge the subgraph statistics with the intermediate table
combined_table = merge(tempTab, s[,c("ID","Edge_Chance_AB","Rel_Chance_AB","Source_Chance_AB")], by.x = "Connected_to_ID", by.y = "ID", all.x = T)

## Calculate the cumulative chances
combined_table$RelChance_total = combined_table$Rel_Chance_AB * combined_table$RelChance_BC
combined_table$EdgeChance_total = combined_table$Edge_Chance_AB * combined_table$EdgeChance_BC
combined_table$SourceChance_total = combined_table$Source_Chance_AB * combined_table$SourceChance_BC

# Sum up the path-chances
agg_RelChance = aggregate(combined_table$RelChance_total, by = list(ID = combined_table$ID), FUN = sum)
colnames(agg_RelChance) = c("ID", "agg_rel")
agg_EdgeChance = aggregate(combined_table$EdgeChance_total, by = list(ID = combined_table$ID), FUN = sum)
colnames(agg_EdgeChance) = c("ID", "agg_edge")
agg_SourceChance = aggregate(combined_table$SourceChance_total, by = list(ID = combined_table$ID), FUN = sum)
colnames(agg_SourceChance) = c("ID", "agg_source")

# Create a combined outcome table; tempTab's ID column is a factor (it comes
# from table()), so align the types before merging
c$ID = as.factor(c$ID)
outcome_table = merge(c, agg_RelChance, by = "ID")
outcome_table = merge(outcome_table, agg_EdgeChance, by = "ID")
outcome_table = merge(outcome_table, agg_SourceChance, by = "ID")
outcome_table = merge(outcome_table, nConcepts, by = "ID")
outcome_table$score = 0   # reference score, assigned in the evaluation below

Evaluation

Here, we evaluate the performance of our method. First, the reference scores are assigned to the potential biomarker compounds.

# Assign the combined reference score to each candidate compound
for(i in 1:nrow(all)) {
  if(all$ID[i] %in% c$ID){
    outcome_table[which(outcome_table$ID == all$ID[i]), "score"] = all$total[i]
  }
}

Next, we sort the list based on the most straightforward ranking mechanism: the number of intermediate concepts. As a tie-breaker, a secondary sort on the aggregated edge probability is performed. Note that all scores are included in the output table below. If you want to keep all the data, the output variable can be written to a csv file; the command to achieve this is included but commented out.

outcome_table_nCon = outcome_table[order(outcome_table$nConcepts, outcome_table$agg_edge, decreasing = T),]
outcome_table_Edge = outcome_table[order(outcome_table$agg_edge, outcome_table$nConcepts, decreasing = T),]
outcome_table_Source = outcome_table[order(outcome_table$agg_source, outcome_table$agg_rel, decreasing = T),]
outcome_table_Rel = outcome_table[order(outcome_table$agg_rel, outcome_table$agg_source, decreasing = T),]

output = data.frame(Rel = outcome_table_Rel$score, Edge = outcome_table_Edge$score, Source = outcome_table_Source$score, Concept = outcome_table_nCon$score)
#write.csv2(output, "Compounds ranked and scored.csv", row.names = F)
results = data.frame(k = seq(from = 100, to = 2000, by = 100))

# For each cutoff k and each ranking, count the reference compounds retrieved
# in the top k (noweight_*), the average number of ranks per retrieved
# compound (noweight_*_avg), the summed reference scores (sum_weight_*), and
# the average score per rank (weight_*)
for(r in 1:nrow(results)){
  results$noweight_Edges[r] = length(which(output$Edge[1:results$k[r]] > 0))
  results$noweight_Edges_avg[r] = round(results$k[r] / length(which(output$Edge[1:results$k[r]] > 0)), 2)
  results$sum_weight_Edges[r] = sum(output$Edge[1:results$k[r]])
  results$weight_Edges[r] = round(sum(output$Edge[1:results$k[r]]) / results$k[r], 2)
  results$noweight_Rels[r] = length(which(output$Rel[1:results$k[r]] > 0))
  results$noweight_Rels_avg[r] = round(results$k[r] / length(which(output$Rel[1:results$k[r]] > 0)), 2)
  results$sum_weight_Rels[r] = sum(output$Rel[1:results$k[r]])
  results$weight_Rels[r] = round(sum(output$Rel[1:results$k[r]]) / results$k[r], 2)
  results$noweight_Source[r] = length(which(output$Source[1:results$k[r]] > 0))
  results$noweight_Source_avg[r] = round(results$k[r] / length(which(output$Source[1:results$k[r]] > 0)), 2)
  results$sum_weight_Source[r] = sum(output$Source[1:results$k[r]])
  results$weight_Source_avg[r] = round(sum(output$Source[1:results$k[r]]) / results$k[r], 2)
  results$noweight_Concept[r] = length(which(output$Concept[1:results$k[r]] > 0))
  results$noweight_Concept_avg[r] = round(results$k[r] / length(which(output$Concept[1:results$k[r]] > 0)), 2)
  results$sum_weight_Concept[r] = sum(output$Concept[1:results$k[r]])
  results$weight_Concept_avg[r] = round(sum(output$Concept[1:results$k[r]]) / results$k[r], 2)
}
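
As a spot check (added here for illustration; not part of the original notebook), the first row of the table below can be recomputed directly from the output data frame; it should reproduce the 41 retrieved concepts and the average gain of 2.44:

# Spot check for k = 100 under the concept-based ranking
length(which(output$Concept[1:100] > 0))                   # concepts retrieved
round(100 / length(which(output$Concept[1:100] > 0)), 2)   # average gain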

Below are the results up to the top 2000 candidates, when ranked by the number of intermediate concepts.

    k    Concepts retrieved    Average gain    Points retrieved    Avg. weighted gain
  100                    41            2.44                 195                  1.95
  200                    69            2.90                 321                  1.60
  300                    86            3.49                 369                  1.23
  400                   103            3.88                 416                  1.04
  500                   109            4.59                 441                  0.88
  600                   116            5.17                 453                  0.76
  700                   127            5.51                 473                  0.68
  800                   133            6.02                 482                  0.60
  900                   139            6.47                 497                  0.55
 1000                   142            7.04                 506                  0.51
 1100                   146            7.53                 511                  0.46
 1200                   149            8.05                 515                  0.43
 1300                   152            8.55                 522                  0.40
 1400                   154            9.09                 525                  0.38
 1500                   154            9.74                 525                  0.35
 1600                   155           10.32                 526                  0.33
 1700                   157           10.83                 535                  0.31
 1800                   159           11.32                 539                  0.30
 1900                   163           11.66                 547                  0.29
 2000                   163           12.27                 547                  0.27

Because some of the compounds are weighted, we developed a custom ROC-curve calculation, shown below.

# Use the concept-based ranking as input
in_data = outcome_table_nCon

# Positions of the reference compounds in the ranked list, and their weights
indices = which(in_data$score > 0)
weights = in_data$score[indices]

plotstuff = as.data.frame(matrix(nrow = nrow(in_data), ncol = 3))
colnames(plotstuff) = c("TPR", "weight_TPR", "FPR")

# Number of negatives: all ranked compounds that are not in the reference set
N = nrow(in_data) - length(indices)
for(i in 1:nrow(in_data)){
  TP = length(which(in_data$score[1:i] > 0))
  plotstuff[i, "TPR"] = TP / length(indices)                             # TPR = TP/P
  plotstuff[i, "weight_TPR"] = sum(in_data$score[1:i]) / sum(in_data$score)
  plotstuff[i, "FPR"] = (i - TP) / N                                     # FPR = FP/N
}

# Area under the curve via the rectangle rule; start at j = 2 since each
# increment needs the previous point
area = c()
area_weight = c()
for(j in 2:nrow(plotstuff)){
  addition = plotstuff$TPR[j-1] * (plotstuff$FPR[j] - plotstuff$FPR[j-1])
  area = append(area, addition)
  addition_weight = plotstuff$weight_TPR[j-1] * (plotstuff$FPR[j] - plotstuff$FPR[j-1])
  area_weight = append(area_weight, addition_weight)
}
sum(area)
## [1] 0.9560409
sum(area_weight)
## [1] 0.9743335
ggplot(data = plotstuff, aes(x = FPR, y = TPR, color = "Unweighted performance")) +
  geom_line(linetype="longdash", size = 2) +
  geom_line(data = plotstuff, aes(x = FPR, y = weight_TPR, color = "Weighted performance"), size = 2) +
  labs(x="False Positive Rate",y="True Positive Rate") +
  theme(axis.title = element_text(size=15), legend.position = c(0.75, 0.08),  legend.title=element_blank(), legend.text=element_text(size=10), axis.text = element_text(size=15)) +
  scale_color_manual(values=c("#6E6E6E", "#190707")) +
  geom_abline(slope = 1, color = "grey", size = 1.5)

We achieve an AUC of 0.956 for the unweighted curve, and an AUC of 0.974 for the weighted curve.
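
For reuse, the same calculation can be wrapped in a single function. The sketch below is ours (the name weighted_roc and its interface are not from the original analysis); it takes the score column of a ranked outcome table and returns both areas:

# Hypothetical convenience wrapper around the ROC calculation above
weighted_roc = function(scores) {
  P    = sum(scores > 0)            # positives: reference compounds
  N    = length(scores) - P         # negatives: all other compounds
  TP   = cumsum(scores > 0)
  TPR  = TP / P
  wTPR = cumsum(scores) / sum(scores)
  FPR  = (seq_along(scores) - TP) / N
  n    = length(scores)
  # Rectangle rule, identical to the loop above
  list(auc = sum(TPR[-n] * diff(FPR)),
       weight_auc = sum(wTPR[-n] * diff(FPR)))
}
# Example call: weighted_roc(outcome_table_nCon$score)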

Finally, a violin plot is created as a kind of enhanced number line, to show whether there is a difference between the compounds that also have a direct relationship to migraine and those that do not.

# Identify the compounds that also have a direct relationship with migraine
direct = all$ID[which(all$ID %in% subgraph$ID)]
Con_direct = which(outcome_table_nCon$ID %in% direct)

# Keep the top 2000 ranked compounds, retain only the reference compounds,
# and label each one as Direct or Indirect depending on whether it also has
# a direct relationship with migraine
rowcount = 2000
d = outcome_table_nCon[1:rowcount, ]
d$index = 1:rowcount
d = d[which(d$score > 0), ]
d$direct = 0
d$direct[which(d$ID %in% direct)] = d$score[which(d$ID %in% direct)]
d$indirect = abs(d$direct - d$score)
d$list = "Indirect"
d$list[which(d$direct > 0)] = "Direct"

indices_direct = d$index[which(d$direct > 0)]
indices_indirect = d$index[which(d$indirect > 0)]

# Local density of reference compounds: count how many direct (or indirect)
# reference compounds fall within a window of +/- 10 ranks of each compound
d$hist_direct = 0
d$hist_indirect = 0
for(i in 1:nrow(d)){
  val = d$index[i]
  if(d$direct[i] > 0){
    hist_data = sum(indices_direct < val + 10 & indices_direct > val - 10)
    d[i, "hist_direct"] = hist_data
  } else {
    hist_data = sum(indices_indirect < val + 10 & indices_indirect > val - 10)
    d[i, "hist_indirect"] = hist_data
  }
  # Horizontal jitter proportional to the local density
  d[i, "jitter"] = rnorm(1, mean = 0, sd = hist_data/6)
}
# Scale the densities for plotting and shift the two traces to different
# horizontal positions
d$hist_direct = d$hist_direct * 2.5
d$hist_indirect = d$hist_indirect * 2.5
d$hist_direct = d$hist_direct - (5 * 2.5)
d$hist_indirect = d$hist_indirect - (3 * 2.5)

# Plot the figure: violin outlines of the rank distributions, with the
# individual reference compounds as jittered points sized by score
ggplot(data = d, aes(hist_direct, index, fill = list)) + 
  geom_violin(alpha = 0.1) +
  geom_violin(aes(hist_indirect), alpha = 0.1) + 
  geom_point(aes(jitter, index, colour = list, size = score, alpha = 0.75), show.legend = F)  + 
  coord_flip() + 
  geom_point(aes(x=-.2, y = mean(d$index)), colour = "red", size = 3, shape = 17) +
  geom_point(aes(x=.2, y = (nrow(outcome_table) / 100)), colour = "blue", size = 3, shape = 18) +
  scale_x_continuous(breaks=seq(-10,10,5)) +
  theme(legend.position = "none", axis.title = element_text(size=18), axis.text = element_text(size=15)) +  
  ylim(0,2000) + ylab("Rank") +xlab("Density of reference compounds") + xlim(-12, 12)