Skip to content

Commit

Permalink
Fix coclustering instability errors
Browse files Browse the repository at this point in the history
Correction dans les algorithmes d'optimisation:
- partout, tester l'amelioration du cout en remplacant ~if (dCost < dBestCost)~ par ~if (dCost < dBestCost - dEpsilon)~
- correction principale dans KWDataGridOptimizer::OptimizeDataGrid
- impacts pour propager la correction partout
  - DTDiscretizerMODL::DiscretizeNEW
  - DTDiscretizerMODL::DiscretizeOLD
  - DTDiscretizerMODL::DiscretizeGranularizedFrequencyTableNEW (pour le nul cost: < dBestCost + dEpsilon)
  - DTGrouperMODL::GroupPreprocessedTable
  - DTGrouperMODL::SmallSourceNumberGroup
  - KWDiscretizerMODL::Discretize
  - KWGrouperMODL::GroupPreprocessedTable
  - KWGrouperMODL::SmallSourceNumberGroup
  - KWDensityEstimationTest::SearchBestInstanceGridSize
  - MHDiscretizerHistogramMODL::GranularizedDiscretizeValues
  - MHDiscretizerHistogramMODL_fp::OptimizeGranularity
  - MHFloatingPointFrequencyTableBuilder::InitializeDomainBounds

Stabilisation liee au probleme de choix d'une partition aleatoire, base sur un Shuffle, puis un Sort
- le sort est instable entre Windows et Linux en cas d'egalite du critere
- correction en memorisant un index aleatoire suite au Shuffle, puis en utilisant cet index en critere de tri secondaire
  - KWDataGridManager::SortAttributeParts: reimplementation du tri avec random index en cas d'egalite
  - KWSortableSymbolCompareValue: utilisation de l'index comme critere de tri secondaire

Teste sur les 12 jeux de donnees instables entre Windows, Linux, Mac
- quelques jeux de tests de reference ont change
- les resultats sont maintenant identiques sur les trois OS

Teste sur LearningTest entier

Memorisation des jeux de test pour les tests de non regression sur git
- test\LearningTest\TestKhiops\Standard\IrisU
  - deplace depuis Standard-unstable, desormais supprime
- test\LearningTest\TestCoclustering\Standard\Adult2varsTiny
  - deplace depuis Standard-unstable, desormais supprime

Correction a reporter en V11
  • Loading branch information
marcboulle committed Jan 17, 2024
1 parent b4c9b39 commit 133ec96
Show file tree
Hide file tree
Showing 29 changed files with 4,664 additions and 4,636 deletions.
8 changes: 4 additions & 4 deletions src/Learning/DTForest/DTDiscretizerMODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ void DTDiscretizerMODL::DiscretizeNEW(KWFrequencyTable* kwftSource, KWFrequencyT
// delete kwftMergedTable;
// kwftMergedTable = NULL;

if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
// Destruction de l'optimum precedent
Expand Down Expand Up @@ -469,7 +469,7 @@ void DTDiscretizerMODL::DiscretizeOLD(KWFrequencyTable* kwftSource, KWFrequencyT
delete kwftMergedTable;
kwftMergedTable = NULL;

if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
// Destruction de l'optimum precedent
Expand Down Expand Up @@ -630,7 +630,7 @@ void DTDiscretizerMODL::DiscretizeGranularizedFrequencyTableNEW(KWFrequencyTable
kwdfvSourceFrequencyVectortarget1->CopyFrom(kwdfvSourceFrequencyVector1);
nvfmax = nfv;
}
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
// cout << "cout best " << nfv + 1 << " : " << dCost << endl;
dBestCost = dCost;
Expand All @@ -643,7 +643,7 @@ void DTDiscretizerMODL::DiscretizeGranularizedFrequencyTableNEW(KWFrequencyTable

// cout << "nvfmax / nsourcesize : " << nvfmax << " / " << nsourcesize << endl;

if (dCostnull < dBestCost)
if (dCostnull < dBestCost + dEpsilon)
{
// cout << "NULL Cost" << endl;
// cout << "cout best " << nfv + 1 << " : " << dCost << endl;
Expand Down
8 changes: 4 additions & 4 deletions src/Learning/DTForest/DTGrouperMODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ void DTGrouperMODL::GroupPreprocessedTable(KWFrequencyTable* kwftSource, KWFrequ
dCost = ComputeGroupingCost(kwftOptimizedGranularizedTable, nCurrentPartileNumber);

// Cas de l'amelioration du cout
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
// Memorisation du cout optimal
dBestCost = dCost;
Expand Down Expand Up @@ -715,7 +715,7 @@ void DTGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(1),
kwftSource->GetFrequencyVectorAt(2));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup0;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 0;
Expand All @@ -724,7 +724,7 @@ void DTGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(0),
kwftSource->GetFrequencyVectorAt(2));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup1;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 1;
Expand All @@ -733,7 +733,7 @@ void DTGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(0),
kwftSource->GetFrequencyVectorAt(1));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup2;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 2;
Expand Down
14 changes: 11 additions & 3 deletions src/Learning/KWData/KWSortableIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,18 @@ int KWSortableSymbolCompare(const void* elem1, const void* elem2)

int KWSortableSymbolCompareValue(const void* elem1, const void* elem2)
{
int nCompare;

// Comparaison sur le critere de tri
return cast(KWSortableSymbol*, *(Object**)elem1)
->GetSortValue()
.CompareValue(cast(KWSortableSymbol*, *(Object**)elem2)->GetSortValue());
nCompare = cast(KWSortableSymbol*, *(Object**)elem1)
->GetSortValue()
.CompareValue(cast(KWSortableSymbol*, *(Object**)elem2)->GetSortValue());

// Comparaison sur l'index si egal
if (nCompare == 0)
nCompare = cast(KWSortableSymbol*, *(Object**)elem1)->GetIndex() -
cast(KWSortableSymbol*, *(Object**)elem2)->GetIndex();
return nCompare;
}

int KWSortableSymbolCompareDecreasingIndexValue(const void* elem1, const void* elem2)
Expand Down
2 changes: 1 addition & 1 deletion src/Learning/KWData/KWSortableIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class KWSortableSymbol : public KWSortableIndex
// Comparaison de deux objets KWSortableSymbol par reference
int KWSortableSymbolCompare(const void* elem1, const void* elem2);

// Comparaison de deux objets KWSortableSymbol par valeur
// Comparaison de deux objets KWSortableSymbol par valeur, puis par index croissant
int KWSortableSymbolCompareValue(const void* elem1, const void* elem2);

// Comparaison de deux objets KWSortableSymbol par index decroissant, puis par valeur
Expand Down
4 changes: 4 additions & 0 deletions src/Learning/KWDataPreparation/KWDataGrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3558,6 +3558,8 @@ KWDGValue* KWDGValueSet::AddValue(const Symbol& sValue)
{
KWDGValue* value;

require(not bIsDefaultPart or sValue != Symbol::GetStarValue());

// Creation de la valeur
value = NewValue(sValue);

Expand All @@ -3584,6 +3586,8 @@ void KWDGValueSet::DeleteValue(KWDGValue* value)
{
require(value != NULL);

require(not bIsDefaultPart or value->GetValue() != Symbol::GetStarValue());

// Supression de la liste des valuees
nValueNumber--;
if (value->prevValue != NULL)
Expand Down
24 changes: 18 additions & 6 deletions src/Learning/KWDataPreparation/KWDataGridManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2921,10 +2921,12 @@ void KWDataGridManager::SortAttributeParts(KWDGAttribute* sourceAttribute, KWDGA
ObjectArray oaSourceParts;
ObjectArray oaAssociations;
KWSortableSymbol* association;
int nSource;
int n;
KWDGPart* sourcePart;
KWDGPart* groupedPart;
IntVector ivRandomIndexes;
int nSource;
int nRandomIndex;
int n;

require(sourceAttribute != NULL);
require(groupedAttribute != NULL);
Expand All @@ -2943,11 +2945,21 @@ void KWDataGridManager::SortAttributeParts(KWDGAttribute* sourceAttribute, KWDGA
// On exporte les parties sources dans un tableau
sourceAttribute->ExportParts(&oaSourceParts);

// Construction d'un vecteur d'index des parties source pour les gerer en ordre aleatoire
ivRandomIndexes.SetSize(oaSourceParts.GetSize());
for (n = 0; n < ivRandomIndexes.GetSize(); n++)
ivRandomIndexes.SetAt(n, n);
ivRandomIndexes.Shuffle();

// Initialisation d'un tableau d'associations entre index de partie source et
// (premiere) valeur de groupe source
oaAssociations.SetSize(oaSourceParts.GetSize());
for (nSource = 0; nSource < oaSourceParts.GetSize(); nSource++)
for (n = 0; n < ivRandomIndexes.GetSize(); n++)
{
// Attention, le RandomIndex est ici l'index de parcours du vecteur ivRandomIndexes
// Qui vient d'etre perturbe aleatoirement
nRandomIndex = n;
nSource = ivRandomIndexes.GetAt(nRandomIndex);
sourcePart = cast(KWDGPart*, oaSourceParts.GetAt(nSource));

// Recherche de la partie groupee correspondante
Expand All @@ -2956,12 +2968,11 @@ void KWDataGridManager::SortAttributeParts(KWDGAttribute* sourceAttribute, KWDGA
// Creation de l'association entre index de partie et premiere valeur du groupe
association = new KWSortableSymbol;
oaAssociations.SetAt(nSource, association);
association->SetIndex(nSource);
association->SetIndex(nRandomIndex);
association->SetSortValue(groupedPart->GetValueSet()->GetHeadValue()->GetValue());
}

// Tri des association, apres une randomisation pour avoir un ordre aleatoire par groupe
oaAssociations.Shuffle();
oaAssociations.SetCompareFunction(KWSortableSymbolCompareValue);
oaAssociations.Sort();

Expand All @@ -2974,7 +2985,8 @@ void KWDataGridManager::SortAttributeParts(KWDGAttribute* sourceAttribute, KWDGA
association = cast(KWSortableSymbol*, oaAssociations.GetAt(n));

// Recherche de la partie source
nSource = association->GetIndex();
nRandomIndex = association->GetIndex();
nSource = ivRandomIndexes.GetAt(nRandomIndex);
sourcePart = cast(KWDGPart*, oaSourceParts.GetAt(nSource));

// Recherche de la partie groupee correspondante
Expand Down
2 changes: 1 addition & 1 deletion src/Learning/KWDataPreparation/KWDataGridOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ double KWDataGridOptimizer::OptimizeDataGrid(const KWDataGrid* initialDataGrid,
*/
{
// Cas d'amelioration du cout
if (dGranularityBestCost < dBestCost)
if (dGranularityBestCost < dBestCost - dEpsilon)
{
dBestCost = dGranularityBestCost;

Expand Down
2 changes: 1 addition & 1 deletion src/Learning/KWDataPreparation/KWDiscretizerMODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ void KWDiscretizerMODL::Discretize(KWFrequencyTable* kwftSource, KWFrequencyTabl
delete kwftMergedTable;
kwftMergedTable = NULL;

if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
// Destruction de l'optimum precedent
Expand Down
8 changes: 4 additions & 4 deletions src/Learning/KWDataPreparation/KWGrouperMODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ void KWGrouperMODL::GroupPreprocessedTable(KWFrequencyTable* kwftSource, KWFrequ
dCost = ComputeGroupingCost(kwftOptimizedGranularizedTable, nCurrentPartileNumber);

// Cas de l'amelioration du cout
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
// Memorisation du cout optimal
dBestCost = dCost;
Expand Down Expand Up @@ -813,7 +813,7 @@ void KWGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(1),
kwftSource->GetFrequencyVectorAt(2));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup0;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 0;
Expand All @@ -822,7 +822,7 @@ void KWGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(0),
kwftSource->GetFrequencyVectorAt(2));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup1;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 1;
Expand All @@ -831,7 +831,7 @@ void KWGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(0),
kwftSource->GetFrequencyVectorAt(1));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup2;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 2;
Expand Down
2 changes: 1 addition & 1 deletion src/Learning/KWTest/KWDensityEstimationTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ int KWDensityEstimationTest::SearchBestInstanceGridSize(ContinuousVector* cvXVal
cout << nAxisCellNumber << "\t" << dCost << endl;

// Test si amelioration
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestAxisCellNumber = nAxisCellNumber;
Expand Down
2 changes: 1 addition & 1 deletion src/Learning/MHHistograms/MHDiscretizerHistogramMODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ void MHDiscretizerHistogramMODL::GranularizedDiscretizeValues(const ContinuousVe
<< KWContinuous::ContinuousToString(dCost) << endl;

// Memorisation si amelioration du cout
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
if (optimizedHistogramFrequencyTable != NULL)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@ void MHDiscretizerHistogramMODL_fp::OptimizeGranularity(MHHistogram*& optimizedH
delete optimizedHistogramFrequencyTable;

// Memorisation si amelioration du cout
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2127,6 +2127,7 @@ int MHFloatingPointFrequencyTableBuilder::SearchBinIndex(int nSearchedCumulative
void MHFloatingPointFrequencyTableBuilder::InitializeDomainBounds()
{
boolean bDisplay = false;
const double dEpsilon = 1e-7;
int nTotalFrequency;
int i;
double dBestCost;
Expand Down Expand Up @@ -2172,13 +2173,13 @@ void MHFloatingPointFrequencyTableBuilder::InitializeDomainBounds()
cout << KWContinuous::ContinuousToString(cUpperBound - GetMaxValue()) << "\t";
cout << dCost << "\t";
cout << dBestCost << "\t";
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
cout << "Best";
cout << "\n";
}

// Memorisation si amelioration
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;

Expand Down
2 changes: 2 additions & 0 deletions src/Learning/MODL/MODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ void SetWindowsDebugDir(const ALString& sDatasetFamily, const ALString& sDataset
int nRet;

// A parametrer pour chaque utilisateur
// Devra etre fait plus proprement quand tout l'equipe sera sur git, par exemple via une variable
// d'environnement et quelques commentaires clairs
sUserRootPath = "D:/Users/miib6422/Documents/boullema/LearningTest/TestKhiops/";

// Pour permettre de continuer a utiliser LearningTest, on ne fait rien s'il y a deja un fichier test.prm
Expand Down
2 changes: 2 additions & 0 deletions src/Learning/MODL_Coclustering/MODL_Coclustering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ void SetWindowsDebugDir(const ALString& sDatasetFamily, const ALString& sDataset
int nRet;

// A parametrer pour chaque utilisateur
// Devra etre fait plus proprement quand tout l'equipe sera sur git, par exemple via une variable
// d'environnement et quelques commentaires clairs
sUserRootPath = "D:/Users/miib6422/Documents/boullema/LearningTest/TestCoclustering/";

// Pour permettre de continuer a utiliser LearningTest, on ne fait rien s'il y a deja un fichier test.prm
Expand Down

This file was deleted.

Loading

0 comments on commit 133ec96

Please sign in to comment.