32

Given a set of real numbers drawn from a unknown continuous univariate distribution (let's say is is one of beta, Cauchy, chi-square, exponential, F, gamma, Laplace, log-normal, normal, Pareto, Student's t, uniform and Weibull) ..

x <- c(7.7495976,12.1007857,5.8663491,9.9137894,11.3822335,7.4406175,8.6997212,9.4456074,11.8370711,6.4251469,9.3597039,8.7625700,10.3171063,8.0983110,11.7564283,11.7583461,7.3760516,14.5713098,14.3289690,12.8436795,7.1834376,12.2530520,8.9362235,11.8964391,5.4378782,7.8083060,0.1356370,14.9341847,6.8625143,9.0285873,10.2251998,10.3348486,7.7518365,2.8757024,9.2676577,10.6879259,11.7623207,14.0745924,9.3478318,7.6788852,9.7491924,14.9409955,11.0297640,8.5541261,8.6129808,9.2192320,12.3507414,8.9156903,11.6892831,10.2571897,11.1673235,10.5883741,8.2396129,7.3505839,3.4437525,8.3660082,10.5779227,8.5382177,13.6647484,9.0712034,4.1090454,13.4238382,16.1965937,14.2539891,14.6498816,6.9662381,12.3282141,10.9628268,10.8859495,11.6742822,12.0469869,9.1764119,4.2324549,12.6665295,10.7467579,6.4153703,10.3090806,12.0267082,9.2375369,13.8011813,13.0457227,14.0147179,6.9224316,7.1164269,10.7577799,8.0965571,13.3371566,14.6997535,8.8248384,8.0634834,10.2226001,8.5112199,8.1701147,8.1970784,10.5432878,5.9603389,6.6287037,13.3417943,3.1122822,10.4241008,11.4281520,9.4647825,10.5480176,14.2357819,9.4220778,9.7012755,10.9251006,5.3073151,10.8228672,12.0936384,8.5146227,8.4115865,7.7244591,7.2801474,7.3412563,4.5385940,7.8822841,12.7327836,11.5509252,13.0300876,10.0458138,11.3862972,11.3644867,12.6585391,5.8567192,9.8764841,7.6447620,8.7806429,9.2089114,9.1961781,7.2400724,14.7575303,8.6874476,4.6276043,14.0592724,10.3519708,8.2222625,8.7710501,8.5724602,11.4279232,9.6734741,12.1972490,10.1250074,4.8571327,8.0019245,9.8036286,17.7386541,10.8935339,4.7258581,14.2681556,7.4236474,9.4520797,9.2066764,7.7805317,0.4938756,13.0306624,8.0225287,11.1801478,8.7481126,16.5873192,6.0404763,9.5674318,10.8915023,13.2473727,5.5877557,1.4474869,10.9504070,10.8879749,10.7765684,9.1501230,11.0798794,10.0961631,9.5913525,14.0855129,7.3918195,16.6303158,9.1436327,11.9848346,11.4691572,16.0934172,13.1431040,8.2455786,10.7388841,13.7107201,9.6223990,7.6363513,9.5731838,7.0150930,14.1341888,7.5834625,13.8362695,12.9790060,10.4156690,6.4108920,6.3731019,6.3302824,8.4924571,11.2175143,11.6346609,6.0958761,12.8728176,10.2689647,9.7923411,11.3962741,7.3723701,8.1169299,9.7926014,8.7266379,10.7350973,12.7639103,7.4425159,15.9422109,9.9073852,6.2421614,5.2925668,9.9822059,13.9768971,9.3481404,6.8102106,12.6482884,9.8595946,12.8946675,6.3519119,9.2698768,4.9538608,13.8062408,14.7438135,8.5583994,12.4232260,9.4205371,13.6507205,11.7807767,10.9747222,15.9299602,10.0202244,11.9209419,12.8159324,7.0107459,7.8076222,8.0086965,14.7694984,6.4810687,6.6833260,3.9660939,16.2414479,9.3474497,10.2626126,11.7672786,10.1245905,2.3416774,9.2548226,12.3498943,9.1731074,8.6703280,3.8079927,12.0858349,11.1027140,11.9034505,11.1981903,9.5554276,11.5333311,4.1374535,7.9397446,10.6732513,5.4928081,5.9026714,7.1902350,7.3516027,9.5251792,12.8827838,8.6051567,9.9074448,4.7244414,9.4681156,17.4316786,15.0770196,7.4215510,7.2839984,8.2040354,11.2938556,12.2308244,17.2933409,5.7154747,9.9383524,7.9912142,10.2087560,13.0489301,10.2092634,11.4029668,10.3103281,10.2810316,8.9487624,14.2699307,12.8538251,10.7545354,18.0638133,7.2115769,7.4020585,7.9737234,13.1687588,13.7186238,9.6881618,4.2991770,11.4829896,8.0113006,10.0285544,8.3325591,8.8476239,9.3618137,11.0913308,10.2702207,12.0215701,11.8083744,8.1575837,10.0413629,11.7291752,13.8315537,12.4823312,13.3289096,8.5874403,9.8624401,7.0444818,13.9701389,10.0250634,14.3841966,17.4074390,13.1290358,8.3764673,7.8796107,6.4597773,12.4989708,11.3617236,5.0730931,13.5990536,9.4800716,11.1247161,12.6283343,12.5711367,10.8075848,13.2183856,12.4566869,17.0046899,9.9132293,13.8912393,10.4806343,6.7550983,18.4982020,4.6835563,4.6068688,8.4304188,7.8747286,9.4440702,12.1033704,10.7397568,12.4483258,12.0952273,9.4609549,16.1755646,13.2110564,12.5244792,14.5511670,14.9365263,6.6852081,14.6988321,9.8833093,11.1549852,14.4090081,6.2565184,8.3488705,10.8509966,7.6795679,13.5814813,10.1733942,12.1773482,4.7032686,9.9248308,17.7067155,8.2378404,12.8208154,12.7675305,9.0907063,9.5720411,4.5536981,5.2252539,10.7393508,8.1761239,7.8011878,10.8517959,12.8793471,10.1738281,9.0522516,9.7020267,8.5743543,7.1063673,9.4366173,7.5154902,9.2420952,13.7275687,8.2097051,12.4686117,8.6426135,10.6854081,14.8617929,14.2631291,11.1449327,8.4807248,5.9399190,6.7772300,7.2566033,10.3215210,9.2483564,10.8592844,13.8227188,5.8955118,6.8936159,11.4641992,8.6535466,14.1301887,10.2194653,9.3929177,11.8592296,9.3153675,10.8574024,9.5293558,14.1394531,7.1224090,5.6785198,13.1351723,7.1031658,7.6344684,8.6918016,6.8426780,8.6902514,9.9025967,6.1603559,6.3995948,6.7157089,14.9359341,13.1275476,11.2493476,10.7684760,8.5263731,5.1711855,10.2432689,6.7908688,9.2634794,5.6242460,7.7319788,13.7579540,10.5344149,11.2123002,9.5503450,11.3042249,6.6581916,13.0363709,9.0141363,6.8815546,8.6309000,9.4825677,6.9816465,9.4836443,8.5629547,12.5643187,13.2918150,4.9542483,3.8941388,12.0723769,14.6818075,6.2067566,8.6538934,11.4860264,9.6481396,12.7096758,7.8361298,12.0167492,9.2011051,6.7472607,13.5725275,15.0862343,12.5248807,10.8804527,12.7291198,7.7527975,7.8537703,10.5257599,11.2615216,5.2586963,9.3935784,4.8959811,14.9649019,9.7550081,9.0961317,3.0822901,10.4690830,11.4116176,11.8268286,9.6303294,12.6595176,10.3003485,10.6738841,7.1545388,13.1700952,8.8394611,11.7666496,5.3739818,12.5156287,10.5998309,7.9280247,11.3985509,9.3435626,9.1445783,7.5190392,10.5207065,5.5194295,14.4021779,7.9815022,7.3148241,5.0131517,12.1867856,3.4892615,14.7278153,10.0177503,9.0080577,6.2549383,11.5792232,10.0743671,4.6603495,9.1943305,10.0549778,13.3946923,11.0435648,11.9903902,7.5212459,6.9752799,9.7793759,3.0074422,9.9630136,8.2949444,14.4448033,8.8767257,10.4919437,12.8309614,11.9987884,9.4450733,7.1909711,7.7836130,12.0111407,7.8110426,8.8857522,7.2070115,6.1091037,15.5397454,12.4138856,11.0948175,10.3384724,4.0731303,11.9523302,11.7543732,8.6845056,11.3963952,9.1248950,9.8663549,14.4536098,10.5610537,9.6523570,9.9533877,10.1019772,12.0909679,12.1466894,9.8986813,14.2406526,10.1251599,13.5607593,8.3409267,7.3538062,9.2187909,8.3878572,9.6934979,6.8270478,6.9754722,14.7438670,6.2118150,4.3408116,11.4874280,12.9580969,9.5487183,10.2743684,11.2433385,14.4445854,10.3395096,5.7534609,10.5550234,10.9322053,10.2105928,11.3020951,12.9484069,6.5904212,8.4368601,11.3280691,8.6031823,7.6938566,11.3733151,12.3900593,11.7711757,11.2307516,13.4915701,10.7228153,7.3886924,8.4401787,10.2753493,8.4389663,12.1972728,10.4918743,10.6289742,10.5594228,6.7236908,11.2358099,8.5938861,12.3906280,14.4511787,7.4746119,15.8803774,2.5522927,9.6801286,8.5697501,10.8271935,13.5280438,10.6818935,13.5646711,3.5187030,10.4440143,9.8327296,9.7382627,14.1669606,6.9083257,3.8266181,13.6244062,11.0284378,9.5523319,8.9891586,9.9055215,8.3856238,8.7478998,6.6987620,14.7248918,9.2529918,10.2082195,4.9534370,9.2030317,5.2269606,8.0661516,13.1779369,5.2971835,15.0037013,7.2702621,6.9997505,9.6490126,13.9149660,10.7425870,9.7558964,12.5752855,10.5098261,20.2689637,9.8681830,7.8259004,9.4911900,9.6024895,7.6085691,12.0086596,6.6780724,8.2764670,8.9880572,15.9231426,5.9905542,13.5816388,8.9839322,9.5235545,10.1314783,13.1174616,8.1648447,12.5653484,12.4941364,10.5916275,12.7761500,9.8608664,8.1374522,10.6055768,6.5465219,11.7945966,7.0397647,4.4046833,12.4284773,0.4180241,12.0268339,10.0441325,5.3276329,8.4208769,8.5484829,9.8222639,9.4951750,9.3263556,13.7433301,10.1112279,12.3558939,10.8694158,9.7864777,5.5161601,7.0906274,14.5786803,12.9236138,8.9206195,7.0104273,5.8283839,7.6944516,6.2924265,10.0766522,10.3576597,8.5793193,11.2022858,4.9360148,6.5907700,13.0853471,9.5498965,10.8132248,7.3545704,9.3583861,10.5726301,6.8032692,9.5914570,6.1383186,7.0176580,16.8026498,6.7959168,9.2745414,7.7390857,12.5977623,8.6116698,13.6735060,10.8476068,9.6710713,10.1086791,9.6101003,11.2849373,14.3841286,10.0175111,5.9766042,9.2654916,12.3336237,11.0695365,9.4801954,6.6405542,11.7110714,9.2962742,4.5557592,7.9725970,10.3105591,9.1068024,8.1585631,14.9021906,9.2015137,15.0472571,9.1225965,13.9551835,15.1033478,10.6360240,12.0867865,15.6969704,9.5818060,8.1641150,8.2950194,8.6544478,7.9130456,8.8904450,13.9381998,8.9913977,14.0155779,6.2856039,10.7923301,8.8070441,11.2657258,10.7901363,9.1724396,6.6433443,9.5172255,12.3402514,2.7254577,12.4006210,13.2697124,10.0670987,15.3858112,8.2044828,10.7534955,7.9282064,10.9170642,12.8222748,18.2680638,9.0601854,13.2616197,7.0193571,12.2447467,5.3729936,14.8064727,10.5359554,10.4851627,11.8312380,13.3435483,10.5894537,5.0047413,7.5532502,11.9171854,12.1777692,7.6730359,5.5515027,12.3027227,10.1575062,14.8505769,9.6526219,11.2016182,10.7898901,13.6303578,12.8561220,13.3002161,9.0945849,4.9117132,8.0514791,8.3684288,4.7461608,6.3118847,14.3888758,15.8801467,11.6563489,7.9043481,6.1992280,10.4055679,6.4948166,11.8656277,3.8399970,9.5901581,8.6379262,7.4541442,7.1135626,7.9164363,9.6439593,15.6259631,7.3244170,8.4635798,12.0317526,17.1847365,12.5357554,6.0369018,12.9830581,11.2712555,12.3488084,9.3935706,8.1248854,11.4523131,9.6710694,9.5978474,15.1563587,7.5582530,10.8587757,13.5890062,10.1390991,8.1443215,16.1032757,6.5988579,9.6915113,7.6946942,10.5688193,7.9222074,6.0964578,7.0383112,11.5956154,6.6059072,13.5679685,15.1021379,10.2625096,10.2202339,15.7814051,16.3342713,6.1339245,0.9275113,15.8169582,11.0888355,7.8822788,15.2039942,9.6944328,11.7292036,11.6230714,8.4657438,7.6462181,7.1888162,8.1788400,13.7221572,12.4793501,10.4488461,8.9233659,8.9305724,7.4913262,12.5882791,10.6825315,10.8527571,12.1660301,12.4390247,13.8529219,8.5372836,11.2575812,6.4922496,9.5404721,10.7082122,11.2365487,10.2713802,14.8685632,10.7735798,10.6526134,4.8455022,8.3135583,10.8120056,7.2903999,7.0497880,4.9958942,5.9730174,9.8642732,11.5609671,10.1178216,6.6279774,9.2441754,9.9419299,13.4710469,6.0601435,8.2095239,7.9456672,12.7039825,7.4197810,9.5928275,8.2267352,2.8314614,11.5653497,6.0828073,11.3926117,10.5403929,14.9751607,11.7647580,8.2867261,10.0291522,7.7132033,6.3337642,14.6066222,11.3436587,11.2717791,10.8818323,8.0320657,6.7354041,9.1871676,13.4381778,7.4353197,8.9210043,10.2010750,11.9442048,11.0081195,4.3369520,13.2562675,15.9945674,8.7528248,14.4948086,14.3577443,6.7438382,9.1434984,15.4599419,13.1424011,7.0481925,7.4823108,10.5743730,6.4166006,11.8225244,8.9388744,10.3698150,10.3965596,13.5226492,16.0069239,6.1139247,11.0838351,9.1659242,7.9896031,10.7282936,14.2666492,13.6478802,10.6248561,15.3834373,11.5096033,14.5806570,10.7648690,5.3407430,7.7535042,7.1942866,9.8867927,12.7413156,10.8127809,8.1726772,8.3965665)

.. is there some easy way in R to programmatically and automatically find the most likely distribution and the estimated distribution parameters?

Please note that the distribution identification code will be part of an automated process, so manual intervention in the identification won't be possible.

Ben Bolker
  • 211,554
  • 25
  • 370
  • 453
knorv
  • 49,059
  • 74
  • 210
  • 294

6 Answers6

17

My first approach would be to generate qq plots of the given data against the possible distributions.

x <- c(15.771062,14.741310,9.081269,11.276436,11.534672,17.980860,13.550017,13.853336,11.262280,11.049087,14.752701,4.481159,11.680758,11.451909,10.001488,11.106817,7.999088,10.591574,8.141551,12.401899,11.215275,13.358770,8.388508,11.875838,3.137448,8.675275,17.381322,12.362328,10.987731,7.600881,14.360674,5.443649,16.024247,11.247233,9.549301,9.709091,13.642511,10.892652,11.760685,11.717966,11.373979,10.543105,10.230631,9.918293,10.565087,8.891209,10.021141,9.152660,10.384917,8.739189,5.554605,8.575793,12.016232,10.862214,4.938752,14.046626,5.279255,11.907347,8.621476,7.933702,10.799049,8.567466,9.914821,7.483575,11.098477,8.033768,10.954300,8.031797,14.288100,9.813787,5.883826,7.829455,9.462013,9.176897,10.153627,4.922607,6.818439,9.480758,8.166601,12.017158,13.279630,14.464876,13.319124,12.331335,3.194438,9.866487,11.337083,8.958164,8.241395,4.289313,5.508243,4.737891,7.577698,9.626720,16.558392,10.309173,11.740863,8.761573,7.099866,10.032640)
> qqnorm(x)

For more info see link

Another possibility is based on the fitdistr function in the MASS package. Here is the different distributions ordered by their log-likelihood

> library(MASS)
> fitdistr(x, 't')$loglik
[1] -252.2659
Warning message:
In log(s) : NaNs produced
> fitdistr(x, 'normal')$loglik
[1] -252.2968
> fitdistr(x, 'logistic')$loglik
[1] -252.2996
> fitdistr(x, 'weibull')$loglik
[1] -252.3507
> fitdistr(x, 'gamma')$loglik
[1] -255.9099
> fitdistr(x, 'lognormal')$loglik
[1] -260.6328
> fitdistr(x, 'exponential')$loglik
[1] -331.8191
Warning messages:
1: In dgamma(x, shape, scale, log) : NaNs produced
2: In dgamma(x, shape, scale, log) : NaNs produced
midtiby
  • 14,550
  • 6
  • 34
  • 43
  • You just beat me to it! That is exactly what I would do. A loop in order to find the min logLik. Beware! The OP has changed the values so your results aren't correct anymore. Using another software I actually found out that the best fit is given by an inverse normal distribution with parameters mu=9.976 and lambda = 42.411. Does fitdistr accept such a distribution? – gd047 Apr 18 '10 at 08:28
  • gd047: The number of observations was increased from 100 to 1000, but the underlying distribution is still the same. – knorv Apr 18 '10 at 08:35
  • 10
    From the statistics point of view this code is wrong. In general distributions with many parameters perform better log-likelyhood score then distributions with few parameters. But this fact does not means that many parameters distribution hypothesis must be accepted as a better hypothesis against the few parameters distributions. loglik can be compared only when the numbers of parameters estimated are the same in number. – emanuele Jul 12 '12 at 09:26
  • 3
    As emanuele says, this is incorrect. You're doing model selection: you need a model selection criterion. Likelihood is not appropriate – Ben Allison Mar 19 '13 at 11:46
  • I agree with emanuele and I think the point is important enough that I downvoted the answer. An Akaike information criterion ([which is coded in R](https://stat.ethz.ch/R-manual/R-patched/library/stats/html/AIC.html)), BIC or cross-validation usually make more sense than just using the likelihood. Without adjusting for degrees of freedom (or number of parameters), you might be fitting a lot on the noise of the data. – cd98 Sep 03 '15 at 14:05
16

Another similar approach is using the fitdistrplus package

library(fitdistrplus)

Loop through the distributions of interest and generate 'fitdist' objects. Use either "mle" for maximum likelihood estimation or "mme" for matching moment estimation, as the fitting method.

f1<-fitdist(x,"norm",method="mle")

Use bootstrap re-sampling in order to simulate uncertainty in the parameters of the selected model

b_best<-bootdist(f_best)
print(f_best)
plot(f_best)
summary(f_best)

The fitdist method allows for using custom distributions or distributions from other packages, provided that the corresponding density function dname, the corresponding distribution function pname and the corresponding quantile function qname have been defined (or even just the density function).

So if you wanted to test the log-likelihood for the inverse normal distribution:

library(ig)
fitdist(x,"igt",method="mle",start=list(mu=mean(x),lambda=1))$loglik

You may also find Fitting distributions with R helpful.

gd047
  • 29,749
  • 18
  • 107
  • 146
9

(Answer edited to add additional explanation)

  1. You can't really find "the" distribution; the actual distribution from which data are drawn can nearly always* be guaranteed not to be in any "laundry list" provided by any such software. At best you can find "a" distribution (more likely several), one that is an adequate description. Even if you find a great fit there are always an infinity of distributions that are arbitrarily close by. Real data tends to be drawn from heterogeneous mixtures of distributions that themselves don't necessarily have simple functional form.

    * an example where you might hope to is where you know the data were actually generated from exactly one distribution on a list, but such situations are extremely rare.

  2. I don't think just comparing likelihoods is necessarily going to make sense, since some distributions have more parameters than others. AIC might make more sense, except that ...

  3. Attempting to identify a "best fitting" distribution from a list of candidates will tend to produce overfitting, and unless the effect of such model selection is accounted for properly will lead to overconfidence (a model that looks great but doesn't actually fit the data not in your sample). There are such possibilities in R (the package fitdistrplus comes to mind), but as a common practice I would advise against the idea. If you must do it, use holdout samples or cross-validation to obtain models with better generalization error.

Community
  • 1
  • 1
Glen_b
  • 7,883
  • 2
  • 37
  • 48
  • 2
    Are you suggesting that extra parameters may allow one to "over-fit" the data, and thus achieve an undesireably low loglik? – unutbu May 02 '10 at 14:17
  • @unutbu Not necessarily over-fit; but more parameters means more degrees of freedom to fit the data. – Glen_b Aug 25 '12 at 04:08
  • Overfitting is precisely what happens, yes. You need actual model selection here, which penalises models with greater number of parameters. – Ben Allison Mar 19 '13 at 11:44
  • 1
    @BenAllison Not necessarily; just because it will prefer the biggest models doesn't *automatically* mean that the biggest model in the set of candidates is overfitted. If I generate data from a 3-parameter gamma (shifted gamma) and I choose between an exponential and a 2-parameter gamma, I'll always prefer the two-parameter gamma, yet in won't be 'overfitted'. Overfitting will happen when the set of candidates includes models that have more than sufficient parameters to describe the distribution, though. – Glen_b Mar 19 '13 at 12:07
  • 1
    Yes sorry, I didn't mean to imply that overfitting is guaranteed, rather that the additional parameters allow for the possibility of overfitting. I think the most degenerate example of this is mixture models: the model with the highest likelihood is one that has one component per data point. This is rarely a useful model though, and so we can use model selection techniques to mitigate against the potential to overfit increased expressivity in more complex models. – Ben Allison Mar 19 '13 at 13:27
8

I find it hard to imagine a realistic situation where this would be useful. Why not use a non-parametric tool like a kernel density estimate?

hadley
  • 102,019
  • 32
  • 183
  • 245
  • 3
    I had the same view for years, until very recently. I finally have an answer for that: in cases where it's not possible to share the original data, it is useful to have a very simple way of generating synthetic data that looks a lot like the original data. For instance, I want to optimize an algorithm that may be affected by cacheing and I want to test on a massive amount of synthetic data under different scenarios, as well as understand the behavior analytically. It feels dirty to use synthetic data, but I see some utility. – Iterator Aug 29 '11 at 16:13
3

You could try using the Kolmogorov-Smirnov tests (ks.test in R).

If you have time-to-event data, here's software that does a Bayesian chi squared test against a list of common distributions to report the best fit.

John D. Cook
  • 29,517
  • 10
  • 67
  • 94
1

As others have pointed out, this might be framed as a model selection question. It is a wrong approach to use the distribution that fits the data best without taking into account the complexity of the distribution. This is because the more complicated distribution will generally have better fit, but it will likely overfit the data.

You can use the Akaike Information Criteria (AIC) to take into account the complexity of the distribution. This is still unsatisfactory as you're only considering a limited number of distributions, but is still better than just using the log likelihood.

I use just a few distributions, but you can check the documentation to find others that could be relevant

Using the fitdistrplus you can run:

library(fitdistrplus)

distributions = c("norm", "lnorm", "exp",
          "cauchy", "gamma", "logis",
          "weibull")


# the x vector is defined as in the question

# Plot to see which distributions make sense. This should influence
# your choice of candidate distributions
descdist(x, discrete = FALSE, boot = 500)

distr_aic = list()
distr_fit = list()
for (distribution in distributions) {
    distr_fit[[distribution]] = fitdist(x, distribution)
    distr_aic[[distribution]] = distr_fit[[distribution]]$aic
}

> distr_aic
$norm
[1] 5032.269

$lnorm
[1] 5421.815

$exp
[1] 6602.334

$cauchy
[1] 5382.643

$gamma
[1] 5184.17

$logis
[1] 5047.796

$weibull
[1] 5058.336

According to our plot and the AIC, it makes sense to use a normal. You can automatize this by just picking the distribution with the minimum AIC. You can check the estimated parameters with

> distr_fit[['norm']]
Fitting of the distribution ' norm ' by maximum likelihood 
Parameters:
     estimate Std. Error
mean 9.975849 0.09454476
sd   2.989768 0.06685321
cd98
  • 3,442
  • 2
  • 35
  • 51