update to GA #5

Open
wants to merge 4 commits into master
Changes from all commits
273 changes: 159 additions & 114 deletions MVPA/stimulusSelection.py
@@ -1,61 +1,75 @@
# -*- coding: utf-8 -*-
"""
GA based stimulus selector
Creates a population of wowasaurus rex
3 chromosomes, each with 10 'proteins'
Permutation based CX and mutation (preserves uniqueness)

Created on Sat Feb 21 13:10:50 2015
@author: Calvin Leather

To do-
Crossover algorithm is somehow producing a few repeats
Consider single-item/hetero 'buddy' term in fitness function
Stick with the two-sample KS test or use JS divergence? Or its square?

"""

#%% Imports
import numpy as np
import pandas as pd
from deap import base, creator, tools
from sklearn.neighbors.kde import KernelDensity
import matplotlib.pyplot as plt
from scipy.stats import mstats
from scipy.stats import kstest, ks_2samp
from scoop import futures
import random, operator, seaborn


print 'defining variables'

# Choice-data CSV (subject ID, item code, option type, bundle components, value)
csv_filepath='/home/brain/Desktop/Custom MVPA Workflow/options.csv'

#%% Magic Numbers
population_size=20

# SALT
nepochs, ngen, npop, cxpb, mutpb = 4,60,250, 0.4, 0.2
HOFsize=1 #number of individuals to put in the hall of fame each epoch
HallOfFame=[]

SID='a03'
n_single=20 #1
n_hetero=15 #2
n_homo=22 #3
n_genome=n_single+n_hetero+n_homo
n_target=10 #Desired number in each category
chromosomeDict={0:n_single, 1:n_hetero, 2:n_homo}

random.seed()

#%%"""===========define fitness and functions================="""
#%%===========define fitness and functions=================%%#

print 'defining functions'

# Fitness Function
def evalMax(individual):
indiv=dictionaryLookup(individual)

similarityCost= 15/np.sum(np.in1d(individual[0][0],individual[0][1]))
rangeCost=1/(np.ptp(indiv[0])+np.ptp(indiv[1])+np.ptp(indiv[2]))
uniformCost=1/(kstest(indiv[0],'uniform')[1]+kstest(indiv[1],'uniform')[1]+kstest(indiv[2],'uniform')[1]+.00001)
distanceCost=1/(ks_2samp(indiv[0], indiv[1])[1]+ks_2samp(indiv[1], indiv[2])[1]+ks_2samp(indiv[2], indiv[0])[1])
cost=rangeCost+1.5*uniformCost+distanceCost+similarityCost
return (cost,)
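# A minimal sketch of the Jensen-Shannon alternative floated in the module TODO
# (editor's addition, not part of this diff). It estimates the JS divergence
# between two value samples from shared-bin histograms; the bin count and the
# (100, 140) value range mirror the randint placeholder used in the loader below
# and are assumptions, not project choices.
from scipy.stats import entropy

def js_divergence(sample_a, sample_b, bins=10, value_range=(100, 140)):
    # Histogram both samples on a common grid and normalize to probabilities.
    p, _ = np.histogram(sample_a, bins=bins, range=value_range)
    q, _ = np.histogram(sample_b, bins=bins, range=value_range)
    p = p.astype(float) / max(p.sum(), 1)
    q = q.astype(float) / max(q.sum(), 1)
    m = 0.5 * (p + q)
    # entropy(p, m) is the KL divergence KL(p || m); averaging the two halves
    # gives the symmetric, bounded Jensen-Shannon divergence (in nats).
    return 0.5 * entropy(p, m) + 0.5 * entropy(q, m)

# Swapping something like this in for the ks_2samp-based distanceCost would
# reward well-separated value distributions without p-values saturating at small n.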

# Creates the initial generation
def createIndividual():
#Creates an individual with unique sample
return [np.random.choice(valueDictionary[1].keys(),n_target+1),
np.random.choice(valueDictionary[2].keys(),n_target),
np.random.choice(valueDictionary[3].keys(),n_target)]
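# Editor's note (not in this diff): np.random.choice samples WITH replacement by
# default, so the chromosomes built above can contain repeated item codes, which
# is a plausible source of the repeats flagged in the module TODO. A hypothetical
# uniqueness-preserving variant would pass replace=False:
def createUniqueIndividual():
    return [np.random.choice(list(valueDictionary[1].keys()), n_target+1, replace=False),
            np.random.choice(list(valueDictionary[2].keys()), n_target, replace=False),
            np.random.choice(list(valueDictionary[3].keys()), n_target, replace=False)]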


# Crossover algorithm
def nonReplicatingCross(ind1, ind2):
chromosomeNumber = random.randint(0,2)
indLength = len(ind1[chromosomeNumber])
@@ -64,18 +78,21 @@ def nonReplicatingCross(ind1, ind2):
child2 = np.zeros(indLength)
child1[0:cxpoint]=ind2[chromosomeNumber][0:cxpoint]
child2[0:cxpoint]=ind1[chromosomeNumber][0:cxpoint]
index1, index2=0,0
cont1, cont2=1,1
for item1 in ind1[chromosomeNumber]:
if cxpoint+index1==indLength:
break
try:
np.where(child1==item1)[0][0]
continue
except IndexError:
child1[cxpoint+index1]= item1
index1=index1+1


cont1=0
x=np.where(child1==0)[0]
for i in x:
child1[i]=np.random.choice(valueDictionary[chromosomeNumber+1].keys())
for item2 in ind2[chromosomeNumber]:
if cxpoint+index2==indLength:
break
@@ -84,70 +101,84 @@
except IndexError:
child2[cxpoint+index2]= item2
index2=index2+1
cont2=0
x=np.where(child2==0)[0]
for i in x:
child2[i]=np.random.choice(valueDictionary[chromosomeNumber+1].keys())
ind1[chromosomeNumber]=child1
ind2[chromosomeNumber]=child2

index1, index2=0,0

return ind1, ind2
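# Editor's sketch (not in this diff): a quick way to detect the repeats noted in
# the module TODO. has_repeats is a hypothetical helper name; it flags any
# chromosome that contains a duplicated item code. Note that the zero-fill step
# above treats item code 0 as an empty slot and draws replacements without a
# duplicate check, which could itself introduce repeats.
def has_repeats(ind):
    return any(len(np.unique(chrom)) < len(chrom) for chrom in ind)

# Example use, assuming parent1/parent2 are genomes of three chromosomes:
#   c1, c2 = nonReplicatingCross(parent1, parent2)
#   assert not has_repeats(c1) and not has_repeats(c2)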

#Mutation algorithm
def nonReplicatingMutate(ind,indpb):

ind=np.asarray(ind)
for chro in range(0,3):
for i in range(1,len(ind[chro])):
if random.random() < indpb:
working = True
while working:
randomN=np.random.choice(valueDictionary[chro+1].keys())
if randomN not in valueDictionary[chro+1]:
print 'noooooooo'
try:
np.where(ind[chro]==randomN)[0][0] # look for the new code in the chromosome being mutated
except IndexError:
ind[chro][i]=randomN
working=False
return ind


#Maps genotype onto phenotype (item number onto value)
def dictionaryLookup(individual):
indiv=[np.zeros(n_target+1), np.zeros(n_target), np.zeros(n_target)]
for chro in range(0,3):
for i in range(len(individual[0][chro])):
indiv[chro][i]=valueDictionary[chro+1][individual[0][chro][i]]
return indiv
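# Usage note (editor's addition): individuals are wrapped by tools.initRepeat
# with n=1 below, so each individual is a one-element list around the
# three-chromosome genome; that is why the lookup indexes individual[0][chro].
# With a toy dictionary such as valueDictionary = {1: {0: 101.0, 1: 137.0}, ...}
# (illustrative values only), a single-item chromosome beginning [1, 0, ...]
# maps to phenotype values [137.0, 101.0, ...].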

#stores top n individuals of an epoch in a list
def custHallOfFame(population,maxaddsize):
for i in tools.selBest(population, k=maxaddsize):
HallOfFame.append(i)


#%%"""==============import data from csv======================"""
#%%==============import data from csv======================%%#
raw_choice_dataset = pd.read_csv(csv_filepath, sep=',', header=None)
print raw_choice_dataset

raw_choice_dataset=raw_choice_dataset[raw_choice_dataset[0]==SID]

valueDictionary={}
for x in range(1,4):
placeholderValueDictionary={}
for rows in raw_choice_dataset[raw_choice_dataset[3].astype(int)==x].iterrows():
rows[1][6]=random.randint(100,140) # change this once modeling is done
placeholderValueDictionary[int(rows[1][1])] =float(rows[1][6])
valueDictionary[x]=placeholderValueDictionary

bundleLookup={}
for x in raw_choice_dataset[raw_choice_dataset[3].astype(int)==3].iterrows():
bundleLookup[int(x[1][1])]=int(x[1][4]),int(x[1][5])
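# Column layout assumed by the loader above, inferred from the indexing (the
# example row is illustrative, not taken from the real options.csv):
#   col 0: subject ID, col 1: item code, col 3: option type (1=single,
#   2=hetero bundle, 3=homo bundle), cols 4-5: component item codes for
#   bundles, col 6: value (currently overwritten with randint(100,140) above).
#   e.g.  a03,37,x,3,12,12,118   (col 2 is not read by this loader)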



"""
raw_choice_dataset = pd.read_csv(csv_filepath, sep=',', header=None)
raw_choice_dataset.dropna()

valueDictionary={}
for x in range(1,4):
placeholderValueDictionary={}
for i in raw_choice_dataset[(raw_choice_dataset[1]==x)].index:
placeholderValueDictionary[raw_choice_dataset[0][i]] = raw_choice_dataset[2][i]
placeholderValueDictionary[raw_choice_dataset[0][i]] = random.random()
valueDictionary[x]=placeholderValueDictionary
"""
#%%===============initialize toolbox=======================%%#

#%%"""===============initialize toolbox======================="""
print 'initializing'
creator.create("FitnessMax", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, typecode="d", fitness=creator.FitnessMax)
@@ -157,11 +188,10 @@ def dictionaryLookup(individual):
stats.register("mean", np.mean)
stats.register("min", np.min)


toolbox = base.Toolbox()

toolbox.register("create_individual", createIndividual, 1)
toolbox.register("HOF", custHallOfFame, maxaddsize=3)
toolbox.register("create_individual", createIndividual)
toolbox.register("individuals", tools.initRepeat, creator.Individual,
toolbox.create_individual, n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individuals)
@@ -173,55 +203,70 @@
toolbox.register("select", tools.selTournament, tournsize=2)

toolbox.register("map", futures.map)
#%%"""======================main=============================="""

#%%======================main==============================%%#

print 'main'

if __name__ == "__main__":
print 'beginning'

for epoch in xrange(nepochs):
print 'beginning epoch %s of %s' %(epoch+1,nepochs)
pop = toolbox.population(n=npop)


fitnesses = toolbox.map(toolbox.evaluate, pop)

for ind, fit in zip(pop, fitnesses):
ind.fitness.values = fit

for g in range(ngen):
if g%5==0:
print g
offspring = toolbox.select(pop, len(pop))
offspring = map(toolbox.clone, offspring)

for child1, child2 in zip(offspring[::2], offspring[1::2]):
if random.random() < cxpb:
child1[0], child2[0] = toolbox.mate(child1[0], child2[0])
del child1.fitness.values, child2.fitness.values

for mutant in offspring:
if random.random() < mutpb:
mutant[0]=toolbox.mutate(mutant[0])
del mutant.fitness.values

invalids = [ind for ind in offspring if not ind.fitness.valid]
fitnesses = toolbox.map(toolbox.evaluate, invalids)
for ind, fit in zip(invalids, fitnesses):
ind.fitness.values = fit

pop[:] = offspring

a=dictionaryLookup(tools.selBest(pop,k=1)[0])

for i in range(0,3):
ones=np.ones(1000)
kde=KernelDensity(kernel='gaussian', bandwidth=.05).fit(zip(a[i], ones))
x_grid=np.linspace(0, 1, 1000)
pdf = np.exp(kde.score_samples(zip(x_grid, ones)))
plt.plot(x_grid, pdf, linewidth=3, alpha=.5)

toolbox.HOF(pop)
del pop


#%%===============reporting and graphing======================%%#
#Graph, color order - blue green red purple gold light_blue
f, axes=plt.subplots(len(HallOfFame),1, figsize=(7,2*len(HallOfFame)))
plt.tight_layout(pad=3)
num=0

with open('output.txt', 'w') as output_text:
output_text.write("Results for %s individuals, %s generations and %s epochs\n%s\n" %(npop,ngen,nepochs, SID))
for x in HallOfFame:
for i in dictionaryLookup(x):
seaborn.kdeplot(i, shade=True, bw=10, ax=axes[num])

axes[num].set_title("{0:.3f}".format(evalMax(x)[0]))
if num==0:
axes[num].set_title("Graphs of Possible Solutions\n{0} individuals, {1} generations, {2} epoch \n\n\n\n{3:.3f}".format(npop,ngen, nepochs,evalMax(x)[0]))
output_text.write('%s. Similarity- %s, %s\n'%(num,np.sum(np.in1d(x[0][0],x[0][1])),np.sum(np.in1d(x[0][1],x[0][2]))))
output_text.write("%s\n\n" %x[0])
num=num+1

#%% TO-DO's
plt.savefig('/home/brain/Desktop/Custom MVPA Workflow/multipage.pdf', format='pdf', bbox_inches='tight', pad_inches=1)

# make the single chromosome 11 long
# add a term to the fitness function for the "buddy" items.
# Jensen-Shannon?
print 'Program complete'