Here's a way to do it with pymc. This method uses a fixed number of components (n_components) in the mixture model. You could try attaching a prior to n_components and sampling over that prior, or you could just pick something reasonable or use the grid-search technique from my other answer to estimate the number of components (there's a rough sketch of that idea at the end of this answer). In the code below I used 10000 iterations with a burn-in of 9000, but that might not be sufficient to get good results; I would suggest a much longer run and burn-in. I also chose the priors somewhat arbitrarily, so those are worth revisiting. You'll have to experiment with it. Best of luck to you. Here is the code.
import numpy as np
import pymc as mc
import scipy.stats as stats
from matplotlib import pyplot
def generate_random_histogram():
    # Random bin locations between 0 and 100
    bin_locations = np.random.rand(10,) * 100
    bin_locations.sort()
    # Random counts on those locations
    bin_counts = np.random.randint(50, size=len(bin_locations))
    return {'loc': bin_locations, 'count': bin_counts}
def bin_widths(loc):
    widths = []
    for i in range(len(loc) - 1):
        widths.append(loc[i + 1] - loc[i])
    # Give the last bin the same width as the one before it
    widths.append(widths[-1])
    widths = np.array(widths)
    return widths
def mixer(name, weights, value=None):
    # Custom categorical-style node: each data point picks one mixture component
    n = value.shape[0]

    def logp(value, weights):
        # One-hot encode each point's component choice and score it against the weights
        vals = np.zeros(shape=(n, weights.shape[1]), dtype=int)
        vals[np.arange(n), value.astype(int)] = 1
        return mc.multinomial_like(x=vals, n=1, p=weights)

    def random(weights):
        return np.argmax(np.random.multinomial(n=1, pvals=weights[0, :], size=n), 1)

    result = mc.Stochastic(logp=logp,
                           doc='A kernel smoothing density node.',
                           name=name,
                           parents={'weights': weights},
                           random=random,
                           trace=True,
                           value=value,
                           dtype=int,
                           observed=False,
                           cache_depth=2,
                           plot=False,
                           verbose=0)
    return result
def create_model(lowers, uppers, count, n_components):
    n = np.sum(count)
    lower = min(lowers)
    upper = max(uppers)
    locations = mc.Uniform(name='locations', lower=lower, upper=upper,
                           value=np.random.uniform(lower, upper, size=n_components), observed=False)
    precisions = mc.Gamma(name='precisions', alpha=1, beta=1,
                          value=.001 * np.ones(n_components), observed=False)
    weights = mc.CompletedDirichlet('weights', mc.Dirichlet(name='weights_ind', theta=np.ones(n_components)))
    choice = mixer('choice', weights, value=np.ones(n).astype(int))

    @mc.observed
    def histogram_data(value=count, locations=locations, precisions=precisions, weights=weights):
        if hasattr(weights, 'value'):
            weights = weights.value
        # Mixture CDF evaluated at the lower and upper bin edges
        lower_cdfs = sum([weights[0, i] * stats.norm.cdf(lowers, loc=locations[i], scale=np.sqrt(1.0 / precisions[i]))
                          for i in range(weights.shape[1])])
        upper_cdfs = sum([weights[0, i] * stats.norm.cdf(uppers, loc=locations[i], scale=np.sqrt(1.0 / precisions[i]))
                          for i in range(weights.shape[1])])
        bin_probs = upper_cdfs - lower_cdfs
        # Append a catch-all bin so the probabilities sum to one
        bin_probs = np.array(list(bin_probs) + [1.0 - np.sum(bin_probs)])
        n = np.sum(count)
        return mc.multinomial_like(x=np.array(list(count) + [0]), n=n, p=bin_probs)

    @mc.deterministic
    def location(locations=locations, choice=choice):
        return locations[choice.astype(int)]

    @mc.deterministic
    def dispersion(precisions=precisions, choice=choice):
        return precisions[choice.astype(int)]

    data_generator = mc.Normal('data', mu=location, tau=dispersion)
    return locals()
# Generate the histogram
hist = generate_random_histogram()
loc = hist['loc']
count = hist['count']
widths = bin_widths(hist['loc'])
lowers = loc - widths
uppers = loc + widths
# Create the model
model = create_model(lowers, uppers, count, 5)
# Initialize to the MAP estimate
model = mc.MAP(model)
model.fit(method ='fmin')
# Now sample with MCMC
model = mc.MCMC(model)
model.sample(iter=10000, burn=9000, thin=300)
# Plot the mu and tau traces
mc.Matplot.plot(model.trace('locations'))
pyplot.show()
# Get the samples from the fitted pdf
sample = np.ravel(model.trace('data')[:])
# Plot the original histogram, sampled histogram, and pdf
lower = min(lowers)
upper = max(uppers)
kde = stats.gaussian_kde(sample)
x = np.arange(0,100,.1)
y = kde(x)
fig = pyplot.figure()
ax1 = fig.add_subplot(311)
pyplot.xlim(lower,upper)
ax1.bar(loc, count, width=widths)
ax2 = fig.add_subplot(312, sharex=ax1)
ax2.hist(sample, bins=loc)
ax3 = fig.add_subplot(313, sharex=ax1)
ax3.plot(x, y)
pyplot.show()
And as you can see, the two distributions don't look terribly alike; then again, a histogram isn't much to go on. I would play with different numbers of components and more iterations / burn-in, but it's a project. Depending on your priorities, I suspect either @askewchan's answer or my other answer might serve you better. If you do stick with this approach, the sketches below show two of the tweaks I mentioned: picking the number of components and running the chain longer with a convergence check.
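If you want a rough way to pick n_components without putting a prior on it, one option (in the spirit of the grid search from my other answer) is to fit the MAP estimate for a handful of candidate values and compare an information criterion; pymc's MAP object exposes AIC and BIC after fitting. The candidate list below is arbitrary, and I haven't verified how well BIC behaves for this particular model, so treat this as a starting point rather than a recipe.
# Rough sketch: score a few candidate component counts by BIC at the MAP estimate.
# The candidate values are arbitrary -- adjust them to your data.
candidates = [2, 3, 5, 8]
scores = {}
for k in candidates:
    m = mc.MAP(create_model(lowers, uppers, count, k))
    m.fit(method='fmin')
    scores[k] = m.BIC
    print('%d components: BIC = %f' % (k, m.BIC))
best_k = min(scores, key=scores.get)
print('lowest BIC with %d components' % best_k)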

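And for the longer run: here is a sketch of the same sampling step with more iterations and a quick Geweke check on one of the location traces, assuming your pymc version exposes the geweke diagnostic and Matplot.geweke_plot. The iteration counts are still arbitrary, and keep in mind that label switching between components can make individual traces hard to read even when the chain is behaving.
# Rough sketch: a longer MCMC run plus a Geweke convergence check on one
# component location. Iteration counts are arbitrary; scale them to taste.
model = create_model(lowers, uppers, count, 5)
mc.MAP(model).fit(method='fmin')
model = mc.MCMC(model)
model.sample(iter=200000, burn=100000, thin=100)
loc_trace = model.trace('locations')[:][:, 0]    # trace of the first component's location
z_scores = mc.geweke(loc_trace)                  # Geweke z-scores along the chain
mc.Matplot.geweke_plot(z_scores, 'location_0')   # z-scores should stay within about +/- 2
pyplot.show()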