Skip to content

Commit

Permalink
Merge pull request #124 from KhiopsML/122-fix-datagrid-instability
Browse files Browse the repository at this point in the history
Fix coclustering and isprint instabilities
  • Loading branch information
marcboulle authored Jan 17, 2024
2 parents a767b14 + 133ec96 commit 3bc17ff
Show file tree
Hide file tree
Showing 37 changed files with 4,738 additions and 4,648 deletions.
8 changes: 4 additions & 4 deletions src/Learning/DTForest/DTDiscretizerMODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ void DTDiscretizerMODL::DiscretizeNEW(KWFrequencyTable* kwftSource, KWFrequencyT
// delete kwftMergedTable;
// kwftMergedTable = NULL;

if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
// Destruction de l'optimum precedent
Expand Down Expand Up @@ -469,7 +469,7 @@ void DTDiscretizerMODL::DiscretizeOLD(KWFrequencyTable* kwftSource, KWFrequencyT
delete kwftMergedTable;
kwftMergedTable = NULL;

if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
// Destruction de l'optimum precedent
Expand Down Expand Up @@ -630,7 +630,7 @@ void DTDiscretizerMODL::DiscretizeGranularizedFrequencyTableNEW(KWFrequencyTable
kwdfvSourceFrequencyVectortarget1->CopyFrom(kwdfvSourceFrequencyVector1);
nvfmax = nfv;
}
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
// cout << "cout best " << nfv + 1 << " : " << dCost << endl;
dBestCost = dCost;
Expand All @@ -643,7 +643,7 @@ void DTDiscretizerMODL::DiscretizeGranularizedFrequencyTableNEW(KWFrequencyTable

// cout << "nvfmax / nsourcesize : " << nvfmax << " / " << nsourcesize << endl;

if (dCostnull < dBestCost)
if (dCostnull < dBestCost + dEpsilon)
{
// cout << "NULL Cost" << endl;
// cout << "cout best " << nfv + 1 << " : " << dCost << endl;
Expand Down
8 changes: 4 additions & 4 deletions src/Learning/DTForest/DTGrouperMODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ void DTGrouperMODL::GroupPreprocessedTable(KWFrequencyTable* kwftSource, KWFrequ
dCost = ComputeGroupingCost(kwftOptimizedGranularizedTable, nCurrentPartileNumber);

// Cas de l'amelioration du cout
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
// Memorisation du cout optimal
dBestCost = dCost;
Expand Down Expand Up @@ -715,7 +715,7 @@ void DTGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(1),
kwftSource->GetFrequencyVectorAt(2));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup0;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 0;
Expand All @@ -724,7 +724,7 @@ void DTGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(0),
kwftSource->GetFrequencyVectorAt(2));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup1;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 1;
Expand All @@ -733,7 +733,7 @@ void DTGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(0),
kwftSource->GetFrequencyVectorAt(1));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup2;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 2;
Expand Down
4 changes: 2 additions & 2 deletions src/Learning/KWDRRuleLibrary/KWDRStringEncrypt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ void KWDREncrypt::InitWorkingArrays(const Symbol& sKey) const
{
if (isalnum(i))
ivPureAlphanumChars.Add(i);
if (isprint(i) and not isalnum(i))
if (p_isprint(i) and not isalnum(i))
ivPrintableNonAlphanumChars.Add(i);
}

Expand Down Expand Up @@ -365,7 +365,7 @@ void KWDREncrypt::InitWorkingArrays(const Symbol& sKey) const
{
// Caractere non imprimable transforme en blanc
c = i;
if (c >= 128 or not isprint(c))
if (c >= 128 or not p_isprint(c))
c = ' ';

// Prefixe underscore rajoute
Expand Down
4 changes: 2 additions & 2 deletions src/Learning/KWData/KWCLex.inc
Original file line number Diff line number Diff line change
Expand Up @@ -1141,7 +1141,7 @@ YY_RULE_SETUP

// Initialisation de la valeur du token
c = yytext[0];
if (not isprint(c))
if (not p_isprint(c))
{
sToken += '[';
sToken += IntToString((int)c);
Expand All @@ -1160,7 +1160,7 @@ YY_RULE_SETUP
nCorrectedLineNumber--;
break;
}
if (not isprint(c))
if (not p_isprint(c))
{
if (sToken.GetLength() < nMaxLength)
{
Expand Down
4 changes: 2 additions & 2 deletions src/Learning/KWData/KWCLex.lex
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ name {letter}({letter}|{digit})*

// Initialisation de la valeur du token
c = yytext[0];
if (not isprint(c))
if (not p_isprint(c))
{
sToken += '[';
sToken += IntToString((int)c);
Expand All @@ -271,7 +271,7 @@ name {letter}({letter}|{digit})*
nCorrectedLineNumber--;
break;
}
if (not isprint(c))
if (not p_isprint(c))
{
if (sToken.GetLength() < nMaxLength)
{
Expand Down
2 changes: 1 addition & 1 deletion src/Learning/KWData/KWDatabaseFormatDetector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1227,7 +1227,7 @@ int KWDatabaseFormatDetector::ComputeSeparatorPriority(char cSeparator) const
// Si non trouve, on prend le caractere lui meme d'abord dans sa plage ascii, puis dans la plage ascii etendue
if (nPriority == -1)
{
if (isprint(cSeparator))
if (p_isprint(cSeparator))
{
if (cSeparator >= 0)
nPriority = 1000 + cSeparator;
Expand Down
14 changes: 11 additions & 3 deletions src/Learning/KWData/KWSortableIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,18 @@ int KWSortableSymbolCompare(const void* elem1, const void* elem2)

// Comparison callback for two KWSortableSymbol objects: by sort value, then by increasing index.
// elem1 and elem2 each point to an Object* slot; the pointed-to object is cast to KWSortableSymbol*.
// Returns the CompareValue result of the two sort values and, when that result is 0, the
// difference between the index of the first element and the index of the second one.
// NOTE(review): this listing carries both a direct `return` of the CompareValue result and the
// nCompare-based logic that follows it; read literally, the early return makes the index
// tie-break unreachable. Presumably the early return is the statement removed by the change -- confirm.
int KWSortableSymbolCompareValue(const void* elem1, const void* elem2)
{
int nCompare;

// Compare on the sort criterion (the sort value)
return cast(KWSortableSymbol*, *(Object**)elem1)
->GetSortValue()
.CompareValue(cast(KWSortableSymbol*, *(Object**)elem2)->GetSortValue());
nCompare = cast(KWSortableSymbol*, *(Object**)elem1)
->GetSortValue()
.CompareValue(cast(KWSortableSymbol*, *(Object**)elem2)->GetSortValue());

// Tie-break on the index when the sort values compare equal,
// so that two distinct indexes never compare as equal
if (nCompare == 0)
nCompare = cast(KWSortableSymbol*, *(Object**)elem1)->GetIndex() -
cast(KWSortableSymbol*, *(Object**)elem2)->GetIndex();
return nCompare;
}

int KWSortableSymbolCompareDecreasingIndexValue(const void* elem1, const void* elem2)
Expand Down
2 changes: 1 addition & 1 deletion src/Learning/KWData/KWSortableIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class KWSortableSymbol : public KWSortableIndex
// Comparaison de deux objets KWSortableSymbol par reference
int KWSortableSymbolCompare(const void* elem1, const void* elem2);

// Comparaison de deux objets KWSortableSymbol par valeur
// Comparaison de deux objets KWSortableSymbol par valeur, puis par index croissant
int KWSortableSymbolCompareValue(const void* elem1, const void* elem2);

// Comparaison de deux objets KWSortableSymbol par index decroissant, puis par valeur
Expand Down
4 changes: 4 additions & 0 deletions src/Learning/KWDataPreparation/KWDataGrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3558,6 +3558,8 @@ KWDGValue* KWDGValueSet::AddValue(const Symbol& sValue)
{
KWDGValue* value;

require(not bIsDefaultPart or sValue != Symbol::GetStarValue());

// Creation de la valeur
value = NewValue(sValue);

Expand All @@ -3584,6 +3586,8 @@ void KWDGValueSet::DeleteValue(KWDGValue* value)
{
require(value != NULL);

require(not bIsDefaultPart or value->GetValue() != Symbol::GetStarValue());

// Suppression de la liste des valeurs
nValueNumber--;
if (value->prevValue != NULL)
Expand Down
24 changes: 18 additions & 6 deletions src/Learning/KWDataPreparation/KWDataGridManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2921,10 +2921,12 @@ void KWDataGridManager::SortAttributeParts(KWDGAttribute* sourceAttribute, KWDGA
ObjectArray oaSourceParts;
ObjectArray oaAssociations;
KWSortableSymbol* association;
int nSource;
int n;
KWDGPart* sourcePart;
KWDGPart* groupedPart;
IntVector ivRandomIndexes;
int nSource;
int nRandomIndex;
int n;

require(sourceAttribute != NULL);
require(groupedAttribute != NULL);
Expand All @@ -2943,11 +2945,21 @@ void KWDataGridManager::SortAttributeParts(KWDGAttribute* sourceAttribute, KWDGA
// On exporte les parties sources dans un tableau
sourceAttribute->ExportParts(&oaSourceParts);

// Construction d'un vecteur d'index des parties source pour les gerer en ordre aleatoire
ivRandomIndexes.SetSize(oaSourceParts.GetSize());
for (n = 0; n < ivRandomIndexes.GetSize(); n++)
ivRandomIndexes.SetAt(n, n);
ivRandomIndexes.Shuffle();

// Initialisation d'un tableau d'associations entre index de partie source et
// (premiere) valeur de groupe source
oaAssociations.SetSize(oaSourceParts.GetSize());
for (nSource = 0; nSource < oaSourceParts.GetSize(); nSource++)
for (n = 0; n < ivRandomIndexes.GetSize(); n++)
{
// Attention, le RandomIndex est ici l'index de parcours du vecteur ivRandomIndexes
// Qui vient d'etre perturbe aleatoirement
nRandomIndex = n;
nSource = ivRandomIndexes.GetAt(nRandomIndex);
sourcePart = cast(KWDGPart*, oaSourceParts.GetAt(nSource));

// Recherche de la partie groupee correspondante
Expand All @@ -2956,12 +2968,11 @@ void KWDataGridManager::SortAttributeParts(KWDGAttribute* sourceAttribute, KWDGA
// Creation de l'association entre index de partie et premiere valeur du groupe
association = new KWSortableSymbol;
oaAssociations.SetAt(nSource, association);
association->SetIndex(nSource);
association->SetIndex(nRandomIndex);
association->SetSortValue(groupedPart->GetValueSet()->GetHeadValue()->GetValue());
}

// Tri des associations, apres une randomisation pour avoir un ordre aleatoire par groupe
oaAssociations.Shuffle();
oaAssociations.SetCompareFunction(KWSortableSymbolCompareValue);
oaAssociations.Sort();

Expand All @@ -2974,7 +2985,8 @@ void KWDataGridManager::SortAttributeParts(KWDGAttribute* sourceAttribute, KWDGA
association = cast(KWSortableSymbol*, oaAssociations.GetAt(n));

// Recherche de la partie source
nSource = association->GetIndex();
nRandomIndex = association->GetIndex();
nSource = ivRandomIndexes.GetAt(nRandomIndex);
sourcePart = cast(KWDGPart*, oaSourceParts.GetAt(nSource));

// Recherche de la partie groupee correspondante
Expand Down
2 changes: 1 addition & 1 deletion src/Learning/KWDataPreparation/KWDataGridOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ double KWDataGridOptimizer::OptimizeDataGrid(const KWDataGrid* initialDataGrid,
*/
{
// Cas d'amelioration du cout
if (dGranularityBestCost < dBestCost)
if (dGranularityBestCost < dBestCost - dEpsilon)
{
dBestCost = dGranularityBestCost;

Expand Down
2 changes: 1 addition & 1 deletion src/Learning/KWDataPreparation/KWDiscretizerMODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ void KWDiscretizerMODL::Discretize(KWFrequencyTable* kwftSource, KWFrequencyTabl
delete kwftMergedTable;
kwftMergedTable = NULL;

if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
// Destruction de l'optimum precedent
Expand Down
8 changes: 4 additions & 4 deletions src/Learning/KWDataPreparation/KWGrouperMODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ void KWGrouperMODL::GroupPreprocessedTable(KWFrequencyTable* kwftSource, KWFrequ
dCost = ComputeGroupingCost(kwftOptimizedGranularizedTable, nCurrentPartileNumber);

// Cas de l'amelioration du cout
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
// Memorisation du cout optimal
dBestCost = dCost;
Expand Down Expand Up @@ -813,7 +813,7 @@ void KWGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(1),
kwftSource->GetFrequencyVectorAt(2));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup0;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 0;
Expand All @@ -822,7 +822,7 @@ void KWGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(0),
kwftSource->GetFrequencyVectorAt(2));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup1;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 1;
Expand All @@ -831,7 +831,7 @@ void KWGrouperMODL::SmallSourceNumberGroup(KWFrequencyTable* kwftSource, KWFrequ
MergeFrequencyVectors(workingFrequencyVector, kwftSource->GetFrequencyVectorAt(0),
kwftSource->GetFrequencyVectorAt(1));
dCost = ComputeGroupCost(workingFrequencyVector) + dCostGroup2;
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestTwoGroupsIndex = 2;
Expand Down
7 changes: 4 additions & 3 deletions src/Learning/KWTest/Divers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1829,9 +1829,9 @@ void AnyCharFileGenerator()
char cChar;

FileService::OpenOutputFile(sFileName, fTest);
fTest
<< "Index\tChar\t<Char>"
"\tisupper\tislower\tisdigit\tisxdigit\tisalnum\tisspace\tispunct\tisprint\tisgraph\tiscntrl\tisascii\n";
fTest << "Index\tChar\t<Char>"
"\tisupper\tislower\tisdigit\tisxdigit\tisalnum\tisspace\tispunct\tp_"
"isprint\tisprint\tisgraph\tiscntrl\tisascii\n";
for (i = 0; i < 20; i++)
{
for (nChar = 1; nChar < 256; nChar++)
Expand Down Expand Up @@ -1861,6 +1861,7 @@ void AnyCharFileGenerator()
fTest << (isalnum(nChar) != 0) << "\t";
fTest << (isspace(nChar) != 0) << "\t";
fTest << (ispunct(nChar) != 0) << "\t";
fTest << (p_isprint(nChar) != 0) << "\t";
fTest << (isprint(nChar) != 0) << "\t";
fTest << (isgraph(nChar) != 0) << "\t";
fTest << (iscntrl(nChar) != 0) << "\t";
Expand Down
2 changes: 1 addition & 1 deletion src/Learning/KWTest/KWDensityEstimationTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ int KWDensityEstimationTest::SearchBestInstanceGridSize(ContinuousVector* cvXVal
cout << nAxisCellNumber << "\t" << dCost << endl;

// Test si amelioration
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
nBestAxisCellNumber = nAxisCellNumber;
Expand Down
2 changes: 1 addition & 1 deletion src/Learning/KWTest/KWTextParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ void KWTextParser::BuildLineWordDictionary(char* sLine, ObjectDictionary* odWord
if (bInspectChars)
{
cout << i << "\t" << cLineChar << "\t" << isalnum(cLineChar) << "\t" << ispunct(cLineChar)
<< "\t" << isspace(cLineChar) << "\t" << isprint(cLineChar) << endl;
<< "\t" << isspace(cLineChar) << "\t" << p_isprint(cLineChar) << endl;
}

// Transformation des caracteres accentues
Expand Down
2 changes: 1 addition & 1 deletion src/Learning/MHHistograms/MHDiscretizerHistogramMODL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ void MHDiscretizerHistogramMODL::GranularizedDiscretizeValues(const ContinuousVe
<< KWContinuous::ContinuousToString(dCost) << endl;

// Memorisation si amelioration du cout
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;
if (optimizedHistogramFrequencyTable != NULL)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@ void MHDiscretizerHistogramMODL_fp::OptimizeGranularity(MHHistogram*& optimizedH
delete optimizedHistogramFrequencyTable;

// Memorisation si amelioration du cout
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2127,6 +2127,7 @@ int MHFloatingPointFrequencyTableBuilder::SearchBinIndex(int nSearchedCumulative
void MHFloatingPointFrequencyTableBuilder::InitializeDomainBounds()
{
boolean bDisplay = false;
const double dEpsilon = 1e-7;
int nTotalFrequency;
int i;
double dBestCost;
Expand Down Expand Up @@ -2172,13 +2173,13 @@ void MHFloatingPointFrequencyTableBuilder::InitializeDomainBounds()
cout << KWContinuous::ContinuousToString(cUpperBound - GetMaxValue()) << "\t";
cout << dCost << "\t";
cout << dBestCost << "\t";
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
cout << "Best";
cout << "\n";
}

// Memorisation si amelioration
if (dCost < dBestCost)
if (dCost < dBestCost - dEpsilon)
{
dBestCost = dCost;

Expand Down
Loading

0 comments on commit 3bc17ff

Please sign in to comment.