0

I'm trying to create an R Shiny application that takes in a .RIS (bibliographic citation file) as input, finds potential duplicates based on certain fields (like title, authors, year, etc.), allows the user to decide if they are duplicates and choose the better reference to keep, and then merges these duplicates and exports a cleaned .RIS file.

Here's the R code that I'm using to build the app:

# Load necessary libraries
library(shiny) 
library(bibliometrix)
library(DT)
library(dplyr)
library(stringr)
library(stringdist)
library(iotools)
library(RefManageR)

# Removing all HTML tags function
remove_html_tags <- function(x) {
  gsub("<.*?>", "", x)
}

# Function to find duplicates
find_duplicates <- function(df, selected_fields) {
  df %>%
    group_by(across(all_of(selected_fields))) %>%
    filter(n() > 1)
}

# Function to merge duplicates
merge_duplicates <- function(df, selected_ids) {
  df %>%
    group_by(id) %>%
    summarise(across(everything(), ~ first(.x[which.max(Score)]))) %>%
    ungroup() %>%
    slice(-which(id %in% selected_ids))
}


# Function to write RIS file
write_ris <- function(df, file) {
  for(i in 1:nrow(df)) {
    line <- paste0("TY  - ", df[i, "TY"])
    write(line, file, append = TRUE)
    if(!is.na(df[i, "TI"])) {
      line <- paste0("TI  - ", df[i, "TI"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "AU"])) {
      authors <- strsplit(as.character(df[i, "AU"]), ";")[[1]]
      for(author in authors) {
        line <- paste0("AU  - ", author)
        write(line, file, append = TRUE)
      }
    }
    if(!is.na(df[i, "PY"])) {
      line <- paste0("PY  - ", df[i, "PY"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "T2"])) {
      line <- paste0("T2  - ", df[i, "T2"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "DO"])) {
      line <- paste0("DO  - ", df[i, "DO"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "SP"])) {
      line <- paste0("SP  - ", df[i, "SP"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "IS"])) {
      line <- paste0("IS  - ", df[i, "IS"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "VL"])) {
      line <- paste0("VL  - ", df[i, "VL"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "CN"])) {
      line <- paste0("CN  - ", df[i, "CN"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "LB"])) {
      line <- paste0("LB  - ", df[i, "LB"])
      write(line, file, append = TRUE)
    }
    if(!is.na(df[i, "AN"])) {
      line <- paste0("AN  - ", df[i, "AN"])
      write(line, file, append = TRUE)
    }
    # End of entry
    line <- "ER  -"
    write(line, file, append = TRUE)
  }
}



# Define the UI
ui <- fluidPage(
  fileInput('file', 'Choose RIS file', accept = c('text/ris', '.ris')),
  checkboxGroupInput("duplicateFields", "Select Fields for Checking Duplicates", choices = c("Author" = "AU", "Year" = "PY", "Title" = "TI", "Pages" = "SP"), selected = "TI"),
  actionButton("start", "Start Duplicates Check"),
  DTOutput('contents'),
  uiOutput("checkboxesUI"),
  actionButton("merge", "Merge Selected References"),
  downloadButton("download", "Download Cleaned RIS file")
)

# Define the server logic
server <- function(input, output, session) {
  
  data <- reactive({
    file <- input$file
    if (is.null(file)) {
      return(NULL)
    }
    
    # Read RIS file
    ris <- readLines(file$datapath)
    
    # Convert to dataframe
    df <- convert2df(ris, dbsource = "isi", format = "plaintext")
    
    print(dim(df))  # REMOVE Check dimensions
    
    
    # Preprocess the title field
    df$TI <- df$TI %>%
      remove_html_tags() %>%
      iconv(to = "ASCII//TRANSLIT") %>%
      str_to_title() %>%
      str_replace_all("Alpha", "alpha") %>%
      str_replace_all("Beta", "beta") %>%
      str_replace_all("Gamma", "gamma")
    
    print(dim(df))  # REMOVE Check dimensions after preprocessing the title field
    
    
    # Handle the abstract field
    df$AB <- ifelse(df$AB == "" | df$AB == "no abstract", "no abstract", "abstract available")
    
    print(dim(df))  # Debug line to print dimensions of the dataframe
    
    
    # Indicate preferable references
    df$Score <- 0
    df$Score[df$AU != ""] <- df$Score[df$AU != ""] + 1
    df$Score[df$PG != ""] <- df$Score[df$PG != ""] + 1
    df$Score[df$SO != ""] <- df$Score[df$SO != ""] + 1
    df$Score[df$TI != ""] <- df$Score[df$TI != ""] + 1
    df$Score[df$AB != ""] <- df$Score[df$AB != ""] + 2
    
    return(df)
  })
  

  # Duplicate check
  observeEvent(input$start, {
    
    showModal(modalDialog(
      title = "Please wait",
      "Checking for duplicates...",
      easyClose = FALSE
    ))
    
    df <- data()
    
    selected_fields <- input$duplicateFields
    
    duplicates <- find_duplicates(df, selected_fields)
    
    output$contents <- DT::renderDataTable({
      duplicates
    })
    
    removeModal()
    
    output$checkboxesUI <- renderUI({
      checkboxGroupInput('selected_ids', 'Select Duplicates to Merge:', choices = duplicates$id)
    })
  })
  
  # Merge duplicates
  observeEvent(input$merge, {
    
    showModal(modalDialog(
      title = "Please wait",
      "Merging selected references...",
      easyClose = FALSE
    ))
    
    df <- data()
    
    selected_ids <- input$selected_ids
    
    df_clean <- merge_duplicates(df, selected_ids)
    
    output$contents <- DT::renderDataTable({
      df_clean
    })
    
    removeModal()
    
    data <<- df_clean
  })
  
  # Download data
  output$download <- downloadHandler(
    filename = function() {
      paste("cleaned_data-", Sys.Date(), ".ris", sep="")
    },
    content = function(file) {
      write_ris(data(), file)
    }
  )
}

# Run the application
shinyApp(ui = ui, server = server)

The issue I'm facing is that the application crashes after clicking the "Start Duplicate Check" button with the error message in the console:

Warning: Error in data.frame: arguments imply differing number of rows: 0, 1
1: runApp

In terms of debugging, I've tried checking the dimensions of the data frame at various points in the code using print(dim(df)), but everything seems to be in order until the application crashes.

I'm also including a sample RIS file that I've been using to test the application: text

Does anyone have any suggestions on how to debug this issue? Also, if you have any suggestions for improving the design or efficiency of the code, I would appreciate that as well. Thanks so much!

Edit: Making r2evans updates (Thanks!!) the console gives:

Called from: observe()
Browse[1]> c
Called from: `<reactive:data>`(...)
Browse[1]> c

Converting your isi collection into a bibliographic dataframe

Warning: Error in : 'TY  - JOUR' does not exist in current working directory ('C:/Users/north/Documents/_GDIT/R').
  1: runApp

'TY - JOUR' seems to be interpreted as a path, which shouldn't be the case. In the RIS file format, 'TY - JOUR' is used to denote that a given entry is a journal article, it's not a file or a directory.

May be the convert2df function that seems to be triggering the issue? Should I use something different?

    # Convert to dataframe
    df <- convert2df(ris, dbsource = "isi", format = "plaintext")
Mouse B.
  • 1
  • 1
  • Related: https://stackoverflow.com/q/26147558/3358272. – r2evans Jun 29 '23 at 13:39
  • As for how to debug: isolate exactly which expression is causing the error. This might mean you need to add `browser()` to each and every reactive component. Once you've narrowed it down, look at the arguments you're sending to that function to see what looks amiss. I don't have some of your libraries so I cannot test all of it myself. – r2evans Jun 29 '23 at 13:41
  • Side notes: (1) Doing `write(...)` once for each line is going to be slower than necessary, consider capturing all of your needed writes into a vector (appending as you go) then `write(..)` _once_. (2) `ifelse(df$AB == "" | df$AB == "no abstract", ...)` is more succinct (and a _touch_ safer) with `ifelse(df$AB %in% c("", "no abstract"), ...)`. (3) I've yet to see a place where having nested `observe`/`reactive` as necessary and not confusing. It might work, might work some times, etc. (4) `if (is.null(file)) { return(NULL); }` can be replaced completely with `req(fill)`, more safety-checks too – r2evans Jun 29 '23 at 13:44
  • 1
    Ok! I appreciate the side notes!! I added them in to the best of my ability and the browser() bits as well. Not really spotting what's wrong, but I'll dig some more before I swing back. – Mouse B. Jun 29 '23 at 14:05

0 Answers0