Here is an approach targeting your optimization-problem (and ignoring your permutation-based approach).
I formulate the problem as mixed-integer-problem and use specialized solvers to calculate good solutions.
As your problem is not well-formulated, it might need some modifications. But the general message is: this approach will be hard to beat!.
Code
import numpy as np
from cvxpy import *
""" Parameters """
N_POPULATION = 50
GROUPSIZES = [3, 6, 12, 12, 17]
assert sum(GROUPSIZES) == N_POPULATION
N_GROUPS = len(GROUPSIZES)
OBJ_FACTORS = [0.4, 0.1, 0.15, 0.35] # age is the most important
""" Create fake data """
age_vector = np.clip(np.random.normal(loc=35.0, scale=10.0, size=N_POPULATION).astype(int), 0, np.inf)
height_vector = np.clip(np.random.normal(loc=180.0, scale=15.0, size=N_POPULATION).astype(int), 0, np.inf)
weight_vector = np.clip(np.random.normal(loc=85, scale=20, size=N_POPULATION).astype(int), 0, np.inf)
skill_vector = np.random.randint(0, 100, N_POPULATION)
""" Calculate a-priori stats """
age_mean, height_mean, weight_mean, skill_mean = np.mean(age_vector), np.mean(height_vector), \
np.mean(weight_vector), np.mean(skill_vector)
""" Build optimization-model """
# Variables
X = Bool(N_POPULATION, N_GROUPS) # 1 if part of group
D = Variable(4, N_GROUPS) # aux-var for deviation-norm
# Constraints
constraints = []
# (1) each person is exactly in one group
for p in range(N_POPULATION):
constraints.append(sum_entries(X[p, :]) == 1)
# (2) each group has exactly n (a-priori known) members
for g_ind, g_size in enumerate(GROUPSIZES):
constraints.append(sum_entries(X[:, g_ind]) == g_size)
# Objective: minimize deviation from global-statistics within each group
# (ugly code; could be improved a lot!)
group_deviations = [[], [], [], []] # age, height, weight, skill
for g_ind, g_size in enumerate(GROUPSIZES):
group_deviations[0].append((sum_entries(mul_elemwise(age_vector, X[:, g_ind])) / g_size) - age_mean)
group_deviations[1].append((sum_entries(mul_elemwise(height_vector, X[:, g_ind])) / g_size) - height_mean)
group_deviations[2].append((sum_entries(mul_elemwise(weight_vector, X[:, g_ind])) / g_size) - weight_mean)
group_deviations[3].append((sum_entries(mul_elemwise(skill_vector, X[:, g_ind])) / g_size) - skill_mean)
for i in range(4):
for g in range(N_GROUPS):
constraints.append(D[i,g] >= abs(group_deviations[i][g]))
obj_parts = [sum_entries(OBJ_FACTORS[i] * D[i, :]) for i in range(4)]
objective = Minimize(sum(obj_parts))
""" Build optimization-problem & solve """
problem = Problem(objective, constraints)
problem.solve(solver=GUROBI, verbose=True, TimeLimit=120) # might need to use non-commercial solver here
print('Min-objective: ', problem.value)
""" Evaluate solution """
filled_groups = [[] for g in range(N_GROUPS)]
for g_ind, g_size in enumerate(GROUPSIZES):
for p in range(N_POPULATION):
if np.isclose(X[p, g_ind].value, 1.0):
filled_groups[g_ind].append(p)
for g_ind, g_size in enumerate(GROUPSIZES):
print('Group: ', g_ind, ' of size: ', g_size)
print(' ' + str(filled_groups[g_ind]))
group_stats = []
for g in range(N_GROUPS):
age_mean_in_group = age_vector[filled_groups[g]].mean()
height_mean_in_group = height_vector[filled_groups[g]].mean()
weight_mean_in_group = weight_vector[filled_groups[g]].mean()
skill_mean_in_group = skill_vector[filled_groups[g]].mean()
group_stats.append((age_mean_in_group, height_mean_in_group, weight_mean_in_group, skill_mean_in_group))
print('group-assignment solution means: ')
for g in range(N_GROUPS):
print(np.round(group_stats[g], 1))
""" Compare with input """
input_data = np.vstack((age_vector, height_vector, weight_vector, skill_vector))
print('input-means')
print(age_mean, height_mean, weight_mean, skill_mean)
print('input-data')
print(input_data)
Output (time-limit of 2 minutes; commercial solver)
Time limit reached
Best objective 9.612058823514e-01, best bound 4.784117647059e-01, gap 50.2280%
('Min-objective: ', 0.961205882351435)
('Group: ', 0, ' of size: ', 3)
[16, 20, 27]
('Group: ', 1, ' of size: ', 6)
[26, 32, 34, 45, 47, 49]
('Group: ', 2, ' of size: ', 12)
[0, 6, 10, 12, 15, 21, 24, 30, 38, 42, 43, 48]
('Group: ', 3, ' of size: ', 12)
[2, 3, 13, 17, 19, 22, 23, 25, 31, 36, 37, 40]
('Group: ', 4, ' of size: ', 17)
[1, 4, 5, 7, 8, 9, 11, 14, 18, 28, 29, 33, 35, 39, 41, 44, 46]
group-assignment solution means:
[ 33.3 179.3 83.7 49. ]
[ 33.8 178.2 84.3 49.2]
[ 33.9 178.7 83.8 49.1]
[ 33.8 179.1 84.1 49.2]
[ 34. 179.6 84.7 49. ]
input-means
(33.859999999999999, 179.06, 84.239999999999995, 49.100000000000001)
input-data
[[ 22. 35. 28. 32. 41. 26. 25. 37. 32. 26. 36. 36.
27. 34. 38. 38. 38. 47. 35. 35. 34. 30. 38. 34.
31. 21. 25. 28. 22. 40. 30. 18. 32. 46. 38. 38.
49. 20. 53. 32. 49. 44. 44. 42. 29. 39. 21. 36.
29. 33.]
[ 161. 158. 177. 195. 197. 206. 169. 182. 182. 198. 165. 185.
171. 175. 176. 176. 172. 196. 186. 172. 184. 198. 172. 162.
171. 175. 178. 182. 163. 176. 192. 182. 187. 161. 158. 191.
182. 164. 178. 174. 197. 156. 176. 196. 170. 197. 192. 171.
191. 178.]
[ 85. 103. 99. 93. 71. 109. 63. 87. 60. 94. 48. 122.
56. 84. 69. 162. 104. 71. 92. 97. 101. 66. 58. 69.
88. 69. 80. 46. 74. 61. 25. 74. 59. 69. 112. 82.
104. 62. 98. 84. 129. 71. 98. 107. 111. 117. 81. 74.
110. 64.]
[ 81. 67. 49. 74. 65. 93. 25. 7. 99. 34. 37. 1.
25. 1. 96. 36. 39. 41. 33. 28. 17. 95. 11. 80.
27. 78. 97. 91. 77. 88. 29. 54. 16. 67. 26. 13.
31. 57. 84. 3. 87. 7. 99. 35. 12. 44. 71. 43.
16. 69.]]
Solution remarks
- This solution looks quite nice (regarding mean-deviation) and it only took 2 minutes (we decided on the time-limit a-priori)
- We also got tight bounds:
0.961 is our solution; we know it can't be lower than 4.784
Reproducibility
- The code uses numpy and cvxpy
- An commercial solver was used
- You might need to use a non-commercial MIP-solver (supporting time-limit for early abortion; take current best-solution)
- The valid open-source MIP-solvers supported in cvxpy are: cbc (no chance of setting time-limits for now) and glpk (check the docs for time-limit support)
Model decisions
- The code uses L1-norm penalization, which results in an MIP-problem
- Depending on your problem, it might be wise to use L2-norm penalization (one big deviation hurts more than many smaller ones), which will result in a harder problem (MIQP / MISOCP)