2

I am using the referenceIntervals package in R, to do some data analytics.

In particular I am using the refLimit function which calculates reference and confidence intervals. I want to edit it to remove certain functionality (for instance it runs a shapiro normalitiy test, which stops the entire code if the data larger than 5000, it wont allow you to parametrically test samples less than 120). To do this I have been typing refLimit into the terminal - copying the function definition, then saving it as a separate file (below is the full original definition of the function).

 singleRefLimit = 
function (data, dname = "default", out.method = "horn", out.rm = FALSE, 
    RI = "p", CI = "p", refConf = 0.95, limitConf = 0.9) 
{
    if (out.method == "dixon") {
        output = dixon.outliers(data)
    }
    else if (out.method == "cook") {
        output = cook.outliers(data)
    }
    else if (out.method == "vanderLoo") {
        output = vanderLoo.outliers(data)
    }
    else {
        output = horn.outliers(data)
    }
    if (out.rm == TRUE) {
        data = output$subset
    }
    outliers = output$outliers
    n = length(data)
    mean = mean(data, na.rm = TRUE)
    sd = sd(data, na.rm = TRUE)
    norm = NULL
    if (RI == "n") {
        methodRI = "Reference Interval calculated nonparametrically"
        data = sort(data)
        holder = nonparRI(data, indices = 1:length(data), refConf)
        lowerRefLimit = holder[1]
        upperRefLimit = holder[2]
        if (CI == "p") {
            CI = "n"
        }
    }
    if (RI == "r") {
        methodRI = "Reference Interval calculated using Robust algorithm"
        holder = robust(data, 1:length(data), refConf)
        lowerRefLimit = holder[1]
        upperRefLimit = holder[2]
        CI = "boot"
    }
    if (RI == "p") {
        methodRI = "Reference Interval calculated parametrically"
        methodCI = "Confidence Intervals calculated parametrically"
        refZ = qnorm(1 - ((1 - refConf)/2))
        limitZ = qnorm(1 - ((1 - limitConf)/2))
        lowerRefLimit = mean - refZ * sd
        upperRefLimit = mean + refZ * sd
        se = sqrt(((sd^2)/n) + (((refZ^2) * (sd^2))/(2 * n)))
        lowerRefLowLimit = lowerRefLimit - limitZ * se
        lowerRefUpperLimit = lowerRefLimit + limitZ * se
        upperRefLowLimit = upperRefLimit - limitZ * se
        upperRefUpperLimit = upperRefLimit + limitZ * se
        shap_normalcy = shapiro.test(data)
        shap_output = paste(c("Shapiro-Wilk: W = ", format(shap_normalcy$statistic, 
            digits = 6), ", p-value = ", format(shap_normalcy$p.value, 
            digits = 6)), collapse = "")
        ks_normalcy = suppressWarnings(ks.test(data, "pnorm", 
            m = mean, sd = sd))
        ks_output = paste(c("Kolmorgorov-Smirnov: D = ", format(ks_normalcy$statistic, 
            digits = 6), ", p-value = ", format(ks_normalcy$p.value, 
            digits = 6)), collapse = "")
        if (shap_normalcy$p.value < 0.05 | ks_normalcy$p.value < 
            0.05) {
            norm = list(shap_output, ks_output)
        }
        else {
            norm = list(shap_output, ks_output)
        }
    }
    if (CI == "n") {
        if (n < 120) {
            cat("\nSample size too small for non-parametric confidence intervals, \n    \t\tbootstrapping instead\n")
            CI = "boot"
        }
        else {
            methodCI = "Confidence Intervals calculated nonparametrically"
            ranks = nonparRanks[which(nonparRanks$SampleSize == 
                n), ]
            lowerRefLowLimit = data[ranks$Lower]
            lowerRefUpperLimit = data[ranks$Upper]
            upperRefLowLimit = data[(n + 1) - ranks$Upper]
            upperRefUpperLimit = data[(n + 1) - ranks$Lower]
        }
    }
    if (CI == "boot" & (RI == "n" | RI == "r")) {
        methodCI = "Confidence Intervals calculated by bootstrapping, R = 5000"
        if (RI == "n") {
            bootresult = boot::boot(data = data, statistic = nonparRI, 
                refConf = refConf, R = 5000)
        }
        if (RI == "r") {
            bootresult = boot::boot(data = data, statistic = robust, 
                refConf = refConf, R = 5000)
        }
        bootresultlower = boot::boot.ci(bootresult, conf = limitConf, 
            type = "basic", index = 1)
        bootresultupper = boot::boot.ci(bootresult, conf = limitConf, 
            type = "basic", index = 2)
        lowerRefLowLimit = bootresultlower$basic[4]
        lowerRefUpperLimit = bootresultlower$basic[5]
        upperRefLowLimit = bootresultupper$basic[4]
        upperRefUpperLimit = bootresultupper$basic[5]
    }
    RVAL = list(size = n, dname = dname, out.method = out.method, 
        out.rm = out.rm, outliers = outliers, methodRI = methodRI, 
        methodCI = methodCI, norm = norm, refConf = refConf, 
        limitConf = limitConf, Ref_Int = c(lowerRefLimit = lowerRefLimit, 
            upperRefLimit = upperRefLimit), Conf_Int = c(lowerRefLowLimit = lowerRefLowLimit, 
            lowerRefUpperLimit = lowerRefUpperLimit, upperRefLowLimit = upperRefLowLimit, 
            upperRefUpperLimit = upperRefUpperLimit))
    class(RVAL) = "interval"
    return(RVAL)
}

However when I then execute this file a large number of terms end up being undefined, for instance when I use the function I get 'object 'nonparRanks' not found'.

How do I edit the function in the package? I have looked at trying to important the package namespace and environment but this has not helped. I have also tried to find the actual function in the package files in my directory, but not been able to.

I am reasonably experienced in R, but I have never had to edit a package before. I am clearly missing something about how functions are defined in packages, but I am not sure what.

jps
  • 20,041
  • 15
  • 75
  • 79
Azaz
  • 23
  • 3
  • I think this is a duplicate of one of the few questions I've asked here (some years ago): https://stackoverflow.com/questions/12178830/change-internal-function-of-a-package – Roland Jun 19 '18 at 06:06

2 Answers2

1

In the beginning of the package there is a line

data(sysdata, envir=environment())

See here: https://github.com/cran/referenceIntervals/tree/master/data/sysdata.rda

I suspect that "nonparRanks" is defined there as I don't see it defined anywhere else. So perhaps you could download that file, write your own function, then run that same line before running your function and it may work.

EDIT: Download the file then run:

load("C:/sysdata.rda")

With your path to the file and then your function will work.

AidanGawronski
  • 2,055
  • 1
  • 14
  • 24
  • Excellent. This has solved it, thanks. There is probably some deeper stuff going on about environments that I do not understand, but this has loaded in everything I need. – Azaz Jun 19 '18 at 02:42
  • Great, if it's helpful you can accept this [answer](https://stackoverflow.com/help/accepted-answer). I should mention that the dataframe that gets loaded from sysdata.rda does not have entries for when the samplesize is below 119 (Which I assume relates to the part of your question "it wont allow you to parametrically test samples less than 120"). So that may still cause you problems. – AidanGawronski Jun 19 '18 at 02:47
1

nonparRanks is a function in the referenceIntervals package:

Table that dictate the ranks for the confidence intervals around thecalculated reference interval

Your method of saving and editing the function is fine, but make sure you load all the necessary underlying functions to run it too.

The easiest thing to do might be to:

  1. save your copied and pasted R function as a different name, e.g. singleRefLimit2, then
  2. call library("referenceIntervals"), which will load all the underlying functions you need and then
  3. load your function source("singelRefLimit2.R"), with whatever edits you choose to make.
Scransom
  • 3,175
  • 3
  • 31
  • 51
  • Thanks for the answer. This is roughly what I am trying to do, but even if I load the library, nonParRanks still comes up as undefined? It seems like R is only storing it internally when doing function calculations, because it is not accessible at other times. – Azaz Jun 19 '18 at 01:01
  • This won't work because the function is trying to reference objects which are in the environment of the referenceIntervals package (rather than the global). – AidanGawronski Jun 19 '18 at 01:01
  • try loading it directly then either via the same method, or using the rda file in @AidanGawronski's answer – Scransom Jun 19 '18 at 01:02