I'm trying to create an R Shiny application that takes in a .RIS (bibliographic citation file) as input, finds potential duplicates based on certain fields (like title, authors, year, etc.), allows the user to decide if they are duplicates and choose the better reference to keep, and then merges these duplicates and exports a cleaned .RIS file.
Here's the R code that I'm using to build the app:
# Load necessary libraries
library(shiny)
library(bibliometrix)
library(DT)
library(dplyr)
library(stringr)
library(stringdist)
library(iotools)
library(RefManageR)
# Removing all HTML tags function
remove_html_tags <- function(x) {
gsub("<.*?>", "", x)
}
# Function to find duplicates
find_duplicates <- function(df, selected_fields) {
df %>%
group_by(across(all_of(selected_fields))) %>%
filter(n() > 1)
}
# Function to merge duplicates
merge_duplicates <- function(df, selected_ids) {
df %>%
group_by(id) %>%
summarise(across(everything(), ~ first(.x[which.max(Score)]))) %>%
ungroup() %>%
slice(-which(id %in% selected_ids))
}
# Function to write RIS file
write_ris <- function(df, file) {
for(i in 1:nrow(df)) {
line <- paste0("TY - ", df[i, "TY"])
write(line, file, append = TRUE)
if(!is.na(df[i, "TI"])) {
line <- paste0("TI - ", df[i, "TI"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "AU"])) {
authors <- strsplit(as.character(df[i, "AU"]), ";")[[1]]
for(author in authors) {
line <- paste0("AU - ", author)
write(line, file, append = TRUE)
}
}
if(!is.na(df[i, "PY"])) {
line <- paste0("PY - ", df[i, "PY"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "T2"])) {
line <- paste0("T2 - ", df[i, "T2"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "DO"])) {
line <- paste0("DO - ", df[i, "DO"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "SP"])) {
line <- paste0("SP - ", df[i, "SP"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "IS"])) {
line <- paste0("IS - ", df[i, "IS"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "VL"])) {
line <- paste0("VL - ", df[i, "VL"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "CN"])) {
line <- paste0("CN - ", df[i, "CN"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "LB"])) {
line <- paste0("LB - ", df[i, "LB"])
write(line, file, append = TRUE)
}
if(!is.na(df[i, "AN"])) {
line <- paste0("AN - ", df[i, "AN"])
write(line, file, append = TRUE)
}
# End of entry
line <- "ER -"
write(line, file, append = TRUE)
}
}
# Define the UI
ui <- fluidPage(
fileInput('file', 'Choose RIS file', accept = c('text/ris', '.ris')),
checkboxGroupInput("duplicateFields", "Select Fields for Checking Duplicates", choices = c("Author" = "AU", "Year" = "PY", "Title" = "TI", "Pages" = "SP"), selected = "TI"),
actionButton("start", "Start Duplicates Check"),
DTOutput('contents'),
uiOutput("checkboxesUI"),
actionButton("merge", "Merge Selected References"),
downloadButton("download", "Download Cleaned RIS file")
)
# Define the server logic
server <- function(input, output, session) {
data <- reactive({
file <- input$file
if (is.null(file)) {
return(NULL)
}
# Read RIS file
ris <- readLines(file$datapath)
# Convert to dataframe
df <- convert2df(ris, dbsource = "isi", format = "plaintext")
print(dim(df)) # REMOVE Check dimensions
# Preprocess the title field
df$TI <- df$TI %>%
remove_html_tags() %>%
iconv(to = "ASCII//TRANSLIT") %>%
str_to_title() %>%
str_replace_all("Alpha", "alpha") %>%
str_replace_all("Beta", "beta") %>%
str_replace_all("Gamma", "gamma")
print(dim(df)) # REMOVE Check dimensions after preprocessing the title field
# Handle the abstract field
df$AB <- ifelse(df$AB == "" | df$AB == "no abstract", "no abstract", "abstract available")
print(dim(df)) # Debug line to print dimensions of the dataframe
# Indicate preferable references
df$Score <- 0
df$Score[df$AU != ""] <- df$Score[df$AU != ""] + 1
df$Score[df$PG != ""] <- df$Score[df$PG != ""] + 1
df$Score[df$SO != ""] <- df$Score[df$SO != ""] + 1
df$Score[df$TI != ""] <- df$Score[df$TI != ""] + 1
df$Score[df$AB != ""] <- df$Score[df$AB != ""] + 2
return(df)
})
# Duplicate check
observeEvent(input$start, {
showModal(modalDialog(
title = "Please wait",
"Checking for duplicates...",
easyClose = FALSE
))
df <- data()
selected_fields <- input$duplicateFields
duplicates <- find_duplicates(df, selected_fields)
output$contents <- DT::renderDataTable({
duplicates
})
removeModal()
output$checkboxesUI <- renderUI({
checkboxGroupInput('selected_ids', 'Select Duplicates to Merge:', choices = duplicates$id)
})
})
# Merge duplicates
observeEvent(input$merge, {
showModal(modalDialog(
title = "Please wait",
"Merging selected references...",
easyClose = FALSE
))
df <- data()
selected_ids <- input$selected_ids
df_clean <- merge_duplicates(df, selected_ids)
output$contents <- DT::renderDataTable({
df_clean
})
removeModal()
data <<- df_clean
})
# Download data
output$download <- downloadHandler(
filename = function() {
paste("cleaned_data-", Sys.Date(), ".ris", sep="")
},
content = function(file) {
write_ris(data(), file)
}
)
}
# Run the application
shinyApp(ui = ui, server = server)
The issue I'm facing is that the application crashes after clicking the "Start Duplicate Check" button with the error message in the console:
Warning: Error in data.frame: arguments imply differing number of rows: 0, 1
1: runApp
In terms of debugging, I've tried checking the dimensions of the data frame at various points in the code using print(dim(df))
, but everything seems to be in order until the application crashes.
I'm also including a sample RIS file that I've been using to test the application: text
Does anyone have any suggestions on how to debug this issue? Also, if you have any suggestions for improving the design or efficiency of the code, I would appreciate that as well. Thanks so much!
Edit: Making r2evans updates (Thanks!!) the console gives:
Called from: observe()
Browse[1]> c
Called from: `<reactive:data>`(...)
Browse[1]> c
Converting your isi collection into a bibliographic dataframe
Warning: Error in : 'TY - JOUR' does not exist in current working directory ('C:/Users/north/Documents/_GDIT/R').
1: runApp
'TY - JOUR' seems to be interpreted as a path, which shouldn't be the case. In the RIS file format, 'TY - JOUR' is used to denote that a given entry is a journal article, it's not a file or a directory.
May be the convert2df function that seems to be triggering the issue? Should I use something different?
# Convert to dataframe
df <- convert2df(ris, dbsource = "isi", format = "plaintext")