diff --git a/src/Learning/SNBPredictor/SNBDataTableBinarySliceSet.cpp b/src/Learning/SNBPredictor/SNBDataTableBinarySliceSet.cpp index ffe39c18b..1fb0ffc95 100644 --- a/src/Learning/SNBPredictor/SNBDataTableBinarySliceSet.cpp +++ b/src/Learning/SNBPredictor/SNBDataTableBinarySliceSet.cpp @@ -32,10 +32,6 @@ void SNBDataTableBinarySliceSetLayout::Initialize(int nSomeInstanceNumber, int n int nChunk; int nChunkInstanceNumber; int nChunkInstanceOffset; - LongintVector* lvChunkBlockSizes; - LongintVector* lvChunkBlockOffsets; - longint lBlockSize; - longint lBlockOffset; require(0 < nSomeChunkNumber and nSomeChunkNumber <= nSomeInstanceNumber); require(0 < nSomeSliceNumber and nSomeSliceNumber <= nSomeAttributeNumber); @@ -74,6 +70,7 @@ void SNBDataTableBinarySliceSetLayout::Initialize(int nSomeInstanceNumber, int n // - Les offsets des slices dans la liste d'attributs // - Le relation [index d'attribut -> index de sa slice] // - La relation [index d'attribut -> index d'attribut relatif a sa slice] + // - Les flag de sparse mode de chaque attribut ivSliceAttributeNumbers.SetSize(nSliceNumber); ivSliceAttributeOffsets.SetSize(nSliceNumber); ivAttributeSliceIndexes.SetSize(nAttributeNumber); @@ -98,28 +95,8 @@ void SNBDataTableBinarySliceSetLayout::Initialize(int nSomeInstanceNumber, int n ivSliceAttributeOffsets.SetAt(nSlice, nSliceAttributeOffset); nSliceAttributeOffset += nSliceAttributeNumber; } - - // Initialition des tailles et des offsets des blocs du chunk - oaBlockSizes.SetSize(nChunkNumber); - oaBlockOffsets.SetSize(nChunkNumber); - for (nChunk = 0; nChunk < nChunkNumber; nChunk++) - { - lvChunkBlockSizes = new LongintVector; - lvChunkBlockSizes->SetSize(nSliceNumber); - lvChunkBlockOffsets = new LongintVector; - lvChunkBlockOffsets->SetSize(nSliceNumber); - lBlockOffset = 0ll; - for (nSlice = 0; nSlice < nSliceNumber; nSlice++) - { - lBlockSize = - (longint)ivChunkInstanceNumbers.GetAt(nChunk) * ivSliceAttributeNumbers.GetAt(nSlice); - lvChunkBlockSizes->SetAt(nSlice, lBlockSize); - lvChunkBlockOffsets->SetAt(nSlice, lBlockOffset); - lBlockOffset += lvChunkBlockSizes->GetAt(nSlice) * sizeof(int); - } - oaBlockOffsets.SetAt(nChunk, lvChunkBlockOffsets); - oaBlockSizes.SetAt(nChunk, lvChunkBlockSizes); - } + ivAttributeSparseModes.SetSize(nAttributeNumber); + ivAttributeSparseModes.Initialize(); ensure(nChunkInstanceOffset == nInstanceNumber); ensure(nSliceAttributeOffset == nAttributeNumber); @@ -130,8 +107,7 @@ void SNBDataTableBinarySliceSetLayout::Initialize(int nSomeInstanceNumber, int n boolean SNBDataTableBinarySliceSetLayout::IsInitialized() const { return nAttributeNumber > 0 and nInstanceNumber > 0 and nSliceNumber > 0 and nChunkNumber > 0 and - ivChunkInstanceNumbers.GetSize() > 0 and ivSliceAttributeNumbers.GetSize() > 0 and - oaBlockOffsets.GetSize() > 0 and oaBlockSizes.GetSize() > 0; + ivChunkInstanceNumbers.GetSize() > 0 and ivSliceAttributeNumbers.GetSize() > 0; } void SNBDataTableBinarySliceSetLayout::CleanWorkingData() @@ -142,8 +118,6 @@ void SNBDataTableBinarySliceSetLayout::CleanWorkingData() nSliceNumber = -1; ivChunkInstanceNumbers.SetSize(0); ivSliceAttributeNumbers.SetSize(0); - oaBlockOffsets.DeleteAll(); - oaBlockSizes.DeleteAll(); ensure(not IsInitialized()); } @@ -230,30 +204,10 @@ int SNBDataTableBinarySliceSetLayout::GetRelativeIndexAtAttribute(int nAttribute return ivAttributeRelativeIndexes.GetAt(nAttribute); } -longint SNBDataTableBinarySliceSetLayout::GetBlockSizeAt(int nChunk, int nSlice) const -{ - LongintVector* lvChunkBlockSizes; - - require(IsInitialized()); - require(0 <= nChunk and nChunk < GetChunkNumber()); - require(0 <= nSlice and nSlice < GetSliceNumber()); - - lvChunkBlockSizes = cast(LongintVector*, oaBlockSizes.GetAt(nChunk)); - - return lvChunkBlockSizes->GetAt(nSlice); -} - -longint SNBDataTableBinarySliceSetLayout::GetBlockOffsetAt(int nChunk, int nSlice) const +boolean SNBDataTableBinarySliceSetLayout::IsAttributeSparseAt(int nAttribute) const { - LongintVector* lvChunkBlockOffsets; - require(IsInitialized()); - require(0 <= nChunk and nChunk < GetChunkNumber()); - require(0 <= nSlice and nSlice < GetSliceNumber()); - - lvChunkBlockOffsets = cast(LongintVector*, oaBlockOffsets.GetAt(nChunk)); - - return lvChunkBlockOffsets->GetAt(nSlice); + return (boolean)ivAttributeSparseModes.GetAt(nAttribute); } boolean SNBDataTableBinarySliceSetLayout::Check() const @@ -263,9 +217,6 @@ boolean SNBDataTableBinarySliceSetLayout::Check() const int nInstanceCount; int nSlice; int nAttributeCount; - longint lBlockOffset; - LongintVector* lvChunkBlockSizes; - LongintVector* lvChunkBlockOffsets; if (IsInitialized()) { @@ -312,21 +263,6 @@ boolean SNBDataTableBinarySliceSetLayout::Check() const ivSliceAttributeOffsets.GetAt(nSlice - 1) + ivSliceAttributeNumbers.GetAt(nSlice - 1) == ivSliceAttributeOffsets.GetAt(nSlice); } - - // Coherence des offset et tailles de blocs - for (nChunk = 0; nChunk < nChunkNumber; nChunk++) - { - lvChunkBlockSizes = cast(LongintVector*, oaBlockSizes.GetAt(nChunk)); - lvChunkBlockOffsets = cast(LongintVector*, oaBlockOffsets.GetAt(nChunk)); - lBlockOffset = 0; - for (nSlice = 0; nSlice < nSliceNumber; nSlice++) - { - bOk = bOk and lBlockOffset == lvChunkBlockOffsets->GetAt(nSlice); - lBlockOffset += (longint)lvChunkBlockSizes->GetAt(nSlice) * sizeof(int); - } - bOk = bOk and lBlockOffset == ((longint)ivChunkInstanceNumbers.GetAt(nChunk) * - (longint)nAttributeNumber * (longint)sizeof(int)); - } } else bOk = false; @@ -700,7 +636,6 @@ void SNBDataTableBinarySliceSetSchema::InitializeFromDataPreparationClass( odAttributesByRecodedAttributeName.SetAt( svUsableDataPreparationAttributeRecodedNames->GetAt(nAttribute), attribute); } - ensure(oaAttributes.GetSize() <= dataPreparationClass->GetDataPreparationAttributes()->GetSize()); ensure(IsInitialized() or oaAttributes.GetSize() == 0); ensure(Check()); @@ -1026,178 +961,475 @@ longint SNBDataTableBinarySliceSetRandomizedAttributeIterator::ComputeNecessaryM nMaxAttributesPerSlice * oaDummy.GetUsedMemoryPerElement()) + nAttributeNumber * oaDummy.GetUsedMemoryPerElement(); } - ///////////////////////////////////////////////////////////////////////////////////////////////////// -// Classe SNBDataTableBinarySliceSetBuffer +// Classe SNBDataTableBinarySliceSetColumn +// +SNBDataTableBinarySliceSetColumn::SNBDataTableBinarySliceSetColumn() +{ + bIsSparse = false; +} + +SNBDataTableBinarySliceSetColumn::~SNBDataTableBinarySliceSetColumn() {} -SNBDataTableBinarySliceSetBuffer::SNBDataTableBinarySliceSetBuffer() +void SNBDataTableBinarySliceSetColumn::SetSparse(boolean bSparseValue) { - layout = NULL; - // TODO FOR FELIPE: a supprimer - // buffer = NewIntArray(nIntBufferSize); - fChunkDataFile = NULL; - nOpenFileChunkIndex = -1; - nLoadedBlockChunkIndex = -1; - nLoadedBlockSliceIndex = -1; - nSingleInitializedChunkIndex = -1; - bIsSingleChunkBuffer = false; + bIsSparse = bSparseValue; } -SNBDataTableBinarySliceSetBuffer::~SNBDataTableBinarySliceSetBuffer() +boolean SNBDataTableBinarySliceSetColumn::IsSparse() const { - CleanWorkingData(); - // TODO FOR FELIPE: a supprimer - // DeleteIntArray(buffer); + return bIsSparse; +} + +int SNBDataTableBinarySliceSetColumn::GetValueNumber() const +{ + int nValueNumber; + + if (IsSparse()) + nValueNumber = GetDataSize() / 2; + else + nValueNumber = GetDataSize(); + + return nValueNumber; +} + +int SNBDataTableBinarySliceSetColumn::GetDenseValueAt(int nValueIndex) const +{ + require(not IsSparse()); + require(0 <= nValueIndex and nValueIndex < GetValueNumber()); + return ivData.GetAt(nValueIndex); +} + +int SNBDataTableBinarySliceSetColumn::GetSparseValueAt(int nValueIndex) const +{ + require(IsSparse()); + require(0 <= nValueIndex and nValueIndex < GetValueNumber()); + return ivData.GetAt(2 * nValueIndex + 1); } -void SNBDataTableBinarySliceSetBuffer::SetLayout(const SNBDataTableBinarySliceSetLayout* someLayout) +int SNBDataTableBinarySliceSetColumn::GetSparseValueInstanceIndexAt(int nValueIndex) const { + require(IsSparse()); + require(0 <= nValueIndex and nValueIndex < GetValueNumber()); + return ivData.GetAt(2 * nValueIndex); +} + +boolean SNBDataTableBinarySliceSetColumn::SetDataSize(int nSize) +{ + return ivData.SetLargeSize(nSize); +} + +int SNBDataTableBinarySliceSetColumn::GetDataSize() const +{ + return ivData.GetSize(); +} + +void SNBDataTableBinarySliceSetColumn::SetDataAt(int nDataIndex, int nDataValue) +{ + ivData.SetAt(nDataIndex, nDataValue); +} + +int SNBDataTableBinarySliceSetColumn::GetDataAt(int nDataIndex) const +{ + return ivData.GetAt(nDataIndex); +} + +void SNBDataTableBinarySliceSetColumn::AddData(int nDataValue) +{ + ivData.Add(nDataValue); +} + +boolean SNBDataTableBinarySliceSetColumn::Check() const +{ + boolean bOk = true; + int nData; + int nPreviousInstanceIndex; + + // En sparse il doit avoir une quantite pair de donnees + bOk = bOk and (not bIsSparse or ivData.GetSize() % 2 == 0); + + // Verification que toutes les donnees sont des indices positifs + for (nData = 0; nData < ivData.GetSize(); nData++) + bOk = bOk and ivData.GetAt(nData) >= 0; + + // En cas sparse: Verification que les indexes d'instance sont croisantes + if (bIsSparse and ivData.GetSize() > 0) + { + nPreviousInstanceIndex = ivData.GetAt(0); + for (nData = 2; nData < ivData.GetSize(); nData += 2) + { + bOk = bOk and nPreviousInstanceIndex < ivData.GetAt(nData); + if (not bOk) + break; + nPreviousInstanceIndex = ivData.GetAt(nData); + } + } + + return bOk; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////// +// Classe SNBDataTableBinarySliceSetChunkPhysicalLayout +// +SNBDataTableBinarySliceSetChunkPhysicalLayout::SNBDataTableBinarySliceSetChunkPhysicalLayout() +{ + nChunkIndex = -1; + layout = NULL; +} + +SNBDataTableBinarySliceSetChunkPhysicalLayout::~SNBDataTableBinarySliceSetChunkPhysicalLayout() {} + +void SNBDataTableBinarySliceSetChunkPhysicalLayout::PartiallyInitialize( + int nChunk, const SNBDataTableBinarySliceSetLayout* someLayout) +{ + int nSlice; + int nAttribute; require(someLayout != NULL); require(someLayout->IsInitialized()); require(someLayout->Check()); + require(0 <= nChunk and nChunk < someLayout->GetChunkNumber()); - CleanWorkingData(); + // Parametrage de l'index du chunk et du layout + nChunkIndex = nChunk; layout = someLayout; - svChunkFilePaths.SetSize(layout->GetChunkNumber()); - svChunkFilePaths.Initialize(); + + // Initialization partielle des tableaux (bonne taille mais avec des indices invalides) + lvBlockSizes.SetSize(layout->GetSliceNumber()); + lvBlockOffsets.SetSize(layout->GetSliceNumber()); + for (nSlice = 0; nSlice < layout->GetSliceNumber(); nSlice++) + { + lvBlockSizes.SetAt(nSlice, -1); + lvBlockOffsets.SetAt(nSlice, -1); + } + ivAttributeDataSizes.SetSize(layout->GetAttributeNumber()); + for (nAttribute = 0; nAttribute < layout->GetAttributeNumber(); nAttribute++) + ivAttributeDataSizes.SetAt(nAttribute, -1); + + ensure(IsPartiallyInitialized()); } -const SNBDataTableBinarySliceSetLayout* SNBDataTableBinarySliceSetBuffer::GetLayout() const +boolean SNBDataTableBinarySliceSetChunkPhysicalLayout::IsPartiallyInitialized() const +{ + return nChunkIndex >= 0 and layout != NULL; +} + +const SNBDataTableBinarySliceSetLayout* SNBDataTableBinarySliceSetChunkPhysicalLayout::GetLayout() const { return layout; } -boolean SNBDataTableBinarySliceSetBuffer::Initialize(KWClass* recoderClass, KWDataTableSliceSet* sliceSet, - const SNBDataTableBinarySliceSetSchema* schema) +int SNBDataTableBinarySliceSetChunkPhysicalLayout::GetChunkIndex() const { - boolean bOk = true; - int nChunk; + return nChunkIndex; +} - require(layout != NULL); - require(layout->Check()); - require(recoderClass->Check()); - require(sliceSet->Check()); - require(schema != NULL); - require(schema->Check()); - require(schema->GetAttributeNumber() == layout->GetAttributeNumber()); +longint SNBDataTableBinarySliceSetChunkPhysicalLayout::GetBlockSizeAt(int nSlice) const +{ + require(IsInitialized()); + require(0 <= nSlice and nSlice < layout->GetSliceNumber()); + return lvBlockSizes.GetAt(nSlice); +} - // Ce buffer traite plusieurs chunks - bIsSingleChunkBuffer = false; +longint SNBDataTableBinarySliceSetChunkPhysicalLayout::GetBlockOffsetAt(int nSlice) const +{ + require(IsInitialized()); + require(0 <= nSlice and nSlice < layout->GetSliceNumber()); + return lvBlockOffsets.GetAt(nSlice); +} - // Annulation d'une eventuelle initialisation existante - CleanWorkingData(); +void SNBDataTableBinarySliceSetChunkPhysicalLayout::SetAttributeDataSizeAt(int nAttribute, int nAttributeDataSize) +{ + require(IsPartiallyInitialized()); + require(nAttributeDataSize >= 0); + require(0 <= nAttribute and nAttribute < layout->GetAttributeNumber()); + ivAttributeDataSizes.SetAt(nAttribute, nAttributeDataSize); +} + +int SNBDataTableBinarySliceSetChunkPhysicalLayout::GetAttributeDataSizeAt(int nAttribute) const +{ + require(IsInitialized()); + require(0 <= nAttribute and nAttribute < layout->GetAttributeNumber()); + return ivAttributeDataSizes.GetAt(nAttribute); +} - // Initialisation du buffer - bOk = bOk and InitializeBuffer(); +longint SNBDataTableBinarySliceSetChunkPhysicalLayout::GetDataSize() const +{ + longint lSize; + int nAttribute; - // Initialisation des fichiers - if (bOk) + require(IsInitialized()); + + lSize = 0; + for (nAttribute = 0; nAttribute < layout->GetAttributeNumber(); nAttribute++) + lSize += GetAttributeDataSizeAt(nAttribute); + + return lSize; +} + +void SNBDataTableBinarySliceSetChunkPhysicalLayout::FinishInitialization() +{ + longint lBlockOffset; + int nSlice; + longint lBlockSize; + longint lAttributeDataRelativeOffset; + int nSliceAttribute; + int nAttribute; + + require(IsPartiallyInitialized()); + + // En ajoutant les tailles de donnees de chaque attribute on calcule + // - La taille des blocs + // - Les offsets des blocs + // - Les offsets relatifs a la slices des offsets de donnees + // Toutes ces quantites se mesurent en nombre de `int`s + lBlockOffset = 0; + for (nSlice = 0; nSlice < layout->GetSliceNumber(); nSlice++) { - for (nChunk = 0; nChunk < layout->GetChunkNumber(); nChunk++) + lBlockSize = 0; + lAttributeDataRelativeOffset = 0; + for (nSliceAttribute = 0; nSliceAttribute < layout->GetAttributeNumberAtSlice(nSlice); + nSliceAttribute++) { - bOk = InitializeDataFileAtChunk(nChunk, recoderClass, sliceSet, schema); - if (not bOk) - break; + nAttribute = layout->GetAttributeOffsetAtSlice(nSlice) + nSliceAttribute; + lBlockSize += (longint)ivAttributeDataSizes.GetAt(nAttribute); + lAttributeDataRelativeOffset += (longint)ivAttributeDataSizes.GetAt(nAttribute); + } + lvBlockSizes.SetAt(nSlice, lBlockSize); + lvBlockOffsets.SetAt(nSlice, lBlockOffset); + lBlockOffset += lBlockSize; + } + ensure(IsInitialized()); +} + +boolean SNBDataTableBinarySliceSetChunkPhysicalLayout::IsInitialized() const +{ + boolean bIsInitialized = true; + int nSlice; + int nAttribute; + + // Verification du layout + bIsInitialized = bIsInitialized and layout != NULL; + bIsInitialized = bIsInitialized and layout->IsInitialized(); + + // Verfications que les tailles et offsets de blocs ne sont pas invalides + for (nSlice = 0; nSlice < layout->GetSliceNumber(); nSlice++) + { + bIsInitialized = bIsInitialized and lvBlockSizes.GetAt(nSlice) > 0; + bIsInitialized = bIsInitialized and lvBlockOffsets.GetAt(nSlice) >= 0; + } + + // Verification que les tailles et offsets des donneees d'attributs ne sont pas invalides + for (nAttribute = 0; nAttribute < layout->GetAttributeNumber(); nAttribute++) + bIsInitialized = bIsInitialized and ivAttributeDataSizes.GetAt(nAttribute) > 0; + + return bIsInitialized; +} + +void SNBDataTableBinarySliceSetChunkPhysicalLayout::Write(ostream& ost) const +{ + int nSlice; + int nAttribute; + + ost << "Physical Layout"; + if (not IsPartiallyInitialized()) + ost << " not initialized\n"; + else + { + ost << "Chunk #" << nChunkIndex << "\n"; + ost << "Slice Number: " << layout->GetSliceNumber() << "\n"; + + if (IsInitialized()) + { + ost << "Block Sizes: " << lvBlockSizes.GetAt(0); + for (nSlice = 1; nSlice < layout->GetSliceNumber(); nSlice++) + ost << ", " << lvBlockSizes.GetAt(nSlice); + ost << "\n"; + ost << "Block Offsets: " << lvBlockOffsets.GetAt(0); + for (nSlice = 1; nSlice < layout->GetSliceNumber(); nSlice++) + ost << ", " << lvBlockOffsets.GetAt(nSlice); + ost << "\n"; + } + + ost << "Attribute Number: " << layout->GetAttributeNumber() << "\n"; + if (IsInitialized()) + { + ost << "Attribute Data Sizes: " << ivAttributeDataSizes.GetAt(0); + for (nAttribute = 1; nAttribute < layout->GetAttributeNumber(); nAttribute++) + ost << ", " << ivAttributeDataSizes.GetAt(nAttribute); + ost << "\n"; + } + } + cout << "\n"; +} + +boolean SNBDataTableBinarySliceSetChunkPhysicalLayout::Check() const +{ + boolean bOk = true; + longint lCumulatedBlockSize; + longint lCumulatedAttributeDataSize; + int nSlice; + int nAttribute; + + // Verifications d'integrite lors d'une instance partiellement initialisee + if (IsPartiallyInitialized()) + { + bOk = bOk and layout != NULL; + bOk = bOk and layout->IsInitialized(); + bOk = bOk and layout->Check(); + bOk = bOk and (0 <= nChunkIndex and nChunkIndex < layout->GetChunkNumber()); + bOk = bOk and (lvBlockSizes.GetSize() == layout->GetSliceNumber()); + bOk = bOk and (lvBlockOffsets.GetSize() == layout->GetSliceNumber()); + bOk = bOk and (ivAttributeDataSizes.GetSize() == layout->GetAttributeNumber()); + } + + // Verifications d'integrite lors d'une instance intialisee + if (IsInitialized()) + { + // Verification de la quadrature des tailles des blocs et attributs avec les offsets des blocs + lCumulatedBlockSize = 0; + for (nSlice = 0; nSlice < layout->GetSliceNumber(); nSlice++) + { + lCumulatedAttributeDataSize = 0; + for (nAttribute = layout->GetAttributeOffsetAtSlice(nSlice); + nAttribute < + layout->GetAttributeOffsetAtSlice(nSlice) + layout->GetAttributeNumberAtSlice(nSlice); + nAttribute++) + { + lCumulatedAttributeDataSize += (longint)ivAttributeDataSizes.GetAt(nAttribute); + } + + bOk = bOk and (lCumulatedBlockSize == lvBlockOffsets.GetAt(nSlice)); + bOk = bOk and (lCumulatedAttributeDataSize == lvBlockSizes.GetAt(nSlice)); + lCumulatedBlockSize += lvBlockSizes.GetAt(nSlice); } } - ensure(not bOk or IsInitialized()); - ensure(not bOk or Check()); return bOk; } -boolean SNBDataTableBinarySliceSetBuffer::InitializeOnlyAtChunk(int nChunk, KWClass* recoderClass, - KWDataTableSliceSet* sliceSet, - SNBDataTableBinarySliceSetSchema* schema) +///////////////////////////////////////////////////////////////////////////////////////////////////// +// Classe SNBDataTableBinarySliceSetChunkBuffer + +SNBDataTableBinarySliceSetChunkBuffer::SNBDataTableBinarySliceSetChunkBuffer() +{ + nChunkIndex = -1; + layout = NULL; + // TODO FOR FELIPE: a supprimer + // buffer = NewIntArray(nIntBufferSize); + fChunkDataFile = NULL; + nLoadedBlockSliceIndex = -1; +} + +SNBDataTableBinarySliceSetChunkBuffer::~SNBDataTableBinarySliceSetChunkBuffer() +{ + CleanWorkingData(); + // TODO FOR FELIPE: a supprimer + // DeleteIntArray(buffer); +} + +void SNBDataTableBinarySliceSetChunkBuffer::SetLayout(const SNBDataTableBinarySliceSetLayout* someLayout) +{ + require(someLayout != NULL); + require(someLayout->IsInitialized()); + require(someLayout->Check()); + + CleanWorkingData(); + layout = someLayout; +} + +const SNBDataTableBinarySliceSetLayout* SNBDataTableBinarySliceSetChunkBuffer::GetLayout() const +{ + return layout; +} + +boolean SNBDataTableBinarySliceSetChunkBuffer::Initialize(int nChunk, KWClass* recoderClass, + KWDataTableSliceSet* sliceSet, + const SNBDataTableBinarySliceSetSchema* schema) { boolean bOk = true; require(layout != NULL); require(layout->Check()); + require(0 <= nChunk and nChunk < layout->GetChunkNumber()); require(recoderClass->Check()); - // require(sliceSet->Check()); + require(sliceSet->Check()); require(schema != NULL); require(schema->Check()); require(schema->GetAttributeNumber() == layout->GetAttributeNumber()); - // Annulation d'une eventuelle initialisation + // Annulation d'une eventuelle initialisation existante CleanWorkingData(); - // Ce buffer traite un seul chunk - bIsSingleChunkBuffer = true; - nSingleInitializedChunkIndex = nChunk; + // Parametrage de l'index du chunk + nChunkIndex = nChunk; + + // Initialization partielle du layout physique + // Elle continue a s'initialiser lors des appels a la methode InitializeBlockFromSliceSetAt + physicalLayout.PartiallyInitialize(nChunkIndex, layout); - // Initialization du buffer - bOk = bOk and InitializeBuffer(); + // Initialization des objets colonnes + InitializeColumns(); + // Initialisation des donnees if (bOk) { // Cas d'un seule slice: Un monte le seul bloc on memoire sans creer un fichier if (layout->GetSliceNumber() == 1) - { - bOk = LoadBlockFromSliceSetAt(nChunk, 0, recoderClass, sliceSet, schema); - nLoadedBlockChunkIndex = nChunk; - nLoadedBlockSliceIndex = 0; - } + bOk = InitializeBlockFromSliceSetAt(0, recoderClass, sliceSet, schema); // Cas de plusieurs slices: On initialise le fichier de chunk else - bOk = InitializeDataFileAtChunk(nChunk, recoderClass, sliceSet, schema); + bOk = InitializeDataFileFromSliceSetAt(recoderClass, sliceSet, schema); } - else - nSingleInitializedChunkIndex = -1; - ensure(not bOk or IsInitializedOnlyAtChunk(nChunk)); + // Finalisation de l'initialisation du layout physique + physicalLayout.FinishInitialization(); + + ensure(not bOk or IsInitialized()); ensure(not bOk or Check()); return bOk; } -boolean SNBDataTableBinarySliceSetBuffer::InitializeBuffer() +int SNBDataTableBinarySliceSetChunkBuffer::GetChunkIndex() const +{ + require(IsInitialized()); + return nChunkIndex; +} + +boolean SNBDataTableBinarySliceSetChunkBuffer::InitializeColumns() { boolean bOk = true; int nAttribute; - IntVector* ivAttributeIndexes; + SNBDataTableBinarySliceSetColumn* column; // Initialisation du tableau du bloc des donnees chargees oaLoadedBlock.SetSize(layout->GetMaxSliceAttributeNumber()); for (nAttribute = 0; nAttribute < oaLoadedBlock.GetSize(); nAttribute++) { - ivAttributeIndexes = new IntVector; - bOk = bOk and ivAttributeIndexes->SetLargeSize(layout->GetMaxChunkInstanceNumber()); - if (bOk) - oaLoadedBlock.SetAt(nAttribute, ivAttributeIndexes); - else - { - AddError("Could not allocate block vector"); - break; - } + column = new SNBDataTableBinarySliceSetColumn; + oaLoadedBlock.SetAt(nAttribute, column); } return bOk; } -boolean SNBDataTableBinarySliceSetBuffer::InitializeDataFileAtChunk(int nChunk, KWClass* recoderClass, - KWDataTableSliceSet* sliceSet, - const SNBDataTableBinarySliceSetSchema* schema) +boolean SNBDataTableBinarySliceSetChunkBuffer::InitializeDataFileFromSliceSetAt( + KWClass* recoderClass, KWDataTableSliceSet* sliceSet, const SNBDataTableBinarySliceSetSchema* schema) { boolean bOk = true; - ALString sChunkFilePath; int nChunkInstanceNumber; int nSlice; int nSliceAttributeNumber; int nBuffer; - int nInstance; + int nData; int nAttribute; - IntVector* ivAttributeIndexes; + SNBDataTableBinarySliceSetColumn* column; KWLoadIndexVector livLoadedRecodedAttributeIndexes; longint lValueNumber; int nRequestedBufferSize; int* buffer; int nIntBufferSize; - require(0 <= nChunk and nChunk < layout->GetChunkNumber()); - // Creation, indexation, verification de la taille et ouverture du fichier du chunk - sChunkFilePath = CreateTempFileForChunk(nChunk); - svChunkFilePaths.SetAt(nChunk, sChunkFilePath); - bOk = bOk and OpenOutputDataFileAtChunk(nChunk); + sChunkFilePath = CreateTempFileForChunk(nChunkIndex); + bOk = bOk and OpenOutputDataFile(); // Ecriture des indices des blocs correspondant au chunk if (bOk) @@ -1205,7 +1437,7 @@ boolean SNBDataTableBinarySliceSetBuffer::InitializeDataFileAtChunk(int nChunk, // TODO FOR FELIPE: a optimiser en fonction de la taille utile ici, et a adapter aux donnees sparse // Nombre total de valeurs du binary slice set lValueNumber = layout->GetInstanceNumber() * (longint)layout->GetAttributeNumber(); - nRequestedBufferSize = max((int)BufferedFile::nDefaultBufferSize, GetHugeBufferSize()); + nRequestedBufferSize = max((int)(8 * lMB), GetHugeBufferSize()); if (lValueNumber * sizeof(int) < nRequestedBufferSize) nRequestedBufferSize = int(lValueNumber * sizeof(int)); assert(nRequestedBufferSize % sizeof(int) == 0); @@ -1217,11 +1449,11 @@ boolean SNBDataTableBinarySliceSetBuffer::InitializeDataFileAtChunk(int nChunk, // Boucle sur les slices pour ecriture du fichier de chunk Global::ActivateErrorFlowControl(); - nChunkInstanceNumber = layout->GetInstanceNumberAtChunk(nChunk); + nChunkInstanceNumber = layout->GetInstanceNumberAtChunk(nChunkIndex); for (nSlice = 0; nSlice < layout->GetSliceNumber(); nSlice++) { // Chargmement en memoire de la slice du binary slice set depuis le slice set - bOk = bOk and LoadBlockFromSliceSetAt(nChunk, nSlice, recoderClass, sliceSet, schema); + bOk = bOk and InitializeBlockFromSliceSetAt(nSlice, recoderClass, sliceSet, schema); // Ecriture colonnaire (transposee) du bloc vers le fichier binaire du chunk if (bOk) @@ -1230,10 +1462,11 @@ boolean SNBDataTableBinarySliceSetBuffer::InitializeDataFileAtChunk(int nChunk, nSliceAttributeNumber = layout->GetAttributeNumberAtSlice(nSlice); for (nAttribute = 0; nAttribute < nSliceAttributeNumber; nAttribute++) { - ivAttributeIndexes = cast(IntVector*, oaLoadedBlock.GetAt(nAttribute)); - for (nInstance = 0; nInstance < nChunkInstanceNumber; nInstance++) + column = + cast(SNBDataTableBinarySliceSetColumn*, oaLoadedBlock.GetAt(nAttribute)); + for (nData = 0; nData < column->GetDataSize(); nData++) { - buffer[nBuffer] = ivAttributeIndexes->GetAt(nInstance); + buffer[nBuffer] = column->GetDataAt(nData); nBuffer = (nBuffer + 1) % nIntBufferSize; if (nBuffer == 0) { @@ -1270,35 +1503,37 @@ boolean SNBDataTableBinarySliceSetBuffer::InitializeDataFileAtChunk(int nChunk, // Cloture du fichier quoiqu'il arrive bOk = CloseOutputDataFile() and bOk; } - - ensure((bOk and IsInitializedAtChunk(nChunk) and CheckDataFileAtChunk(nChunk)) or not bOk); return bOk; } -boolean SNBDataTableBinarySliceSetBuffer::LoadBlockFromSliceSetAt(int nChunk, int nSlice, KWClass* recoderClass, - KWDataTableSliceSet* sliceSet, - const SNBDataTableBinarySliceSetSchema* schema) +boolean SNBDataTableBinarySliceSetChunkBuffer::InitializeBlockFromSliceSetAt( + int nSlice, KWClass* recoderClass, KWDataTableSliceSet* sliceSet, const SNBDataTableBinarySliceSetSchema* schema) { boolean bOk = true; KWLoadIndexVector livLoadedRecodedAttributeIndexes; int nSliceAttributeNumber; + int nSliceAttributeOffset; int nChunkInstanceNumber; int nSliceNumber; double dProgressPercent; double dProgressPercentSlice; - int nInstance; + int nChunkInstance; KWObject* kwoPartIndices; + int nSliceAttribute; int nAttribute; - IntVector* ivAttributeIndexes; + SNBDataTableBinarySliceSetColumn* column; Continuous cIndex; PeriodicTest periodicTestInterruption; + require(physicalLayout.IsPartiallyInitialized()); + // Chargement des attributs de la slice dans le dictionnaire de recodage LoadRecodedAttributesAtSlice(nSlice, recoderClass, schema, &livLoadedRecodedAttributeIndexes); // Alias locaux des bornes max (pour ne pas avoir de soucis de perf dans cette boucle critique) nSliceAttributeNumber = layout->GetAttributeNumberAtSlice(nSlice); - nChunkInstanceNumber = layout->GetInstanceNumberAtChunk(nChunk); + nSliceAttributeOffset = layout->GetAttributeOffsetAtSlice(nSlice); + nChunkInstanceNumber = layout->GetInstanceNumberAtChunk(nChunkIndex); nSliceNumber = layout->GetSliceNumber(); // Lecture des records du bloc depuis le slice set @@ -1307,46 +1542,84 @@ boolean SNBDataTableBinarySliceSetBuffer::LoadBlockFromSliceSetAt(int nChunk, in dProgressPercentSlice = 0.0; sliceSet->SetReadClass(recoderClass); sliceSet->OpenForRead(); - sliceSet->SkipMultiple(layout->GetInstanceOffsetAtChunk(nChunk)); - for (nInstance = 0; nInstance < nChunkInstanceNumber; nInstance++) - { - // Lecture de l'objet - kwoPartIndices = sliceSet->Read(); - bOk = bOk and kwoPartIndices != NULL; - if (not bOk) - break; + sliceSet->SkipMultiple(layout->GetInstanceOffsetAtChunk(nChunkIndex)); - // Trasposition des donnes de l'objet dans le buffer du bloc - for (nAttribute = 0; nAttribute < nSliceAttributeNumber; nAttribute++) + // Initialization des colonnes et son layout physique pour cette tranche + // - Cas dense : Allocation immediate de la memoire necessaire et saisie du layout physique + // - Cas sparse : L'allocation se fait au fil de l'eau car on ne connait pas sa taille a priori et + // le parametrage du layout physique se fait en fin de methode quand on a les infos + for (nSliceAttribute = 0; nSliceAttribute < nSliceAttributeNumber; nSliceAttribute++) + { + nAttribute = nSliceAttributeOffset + nSliceAttribute; + column = cast(SNBDataTableBinarySliceSetColumn*, oaLoadedBlock.GetAt(nSliceAttribute)); + column->SetSparse(layout->IsAttributeSparseAt(nAttribute)); + if (column->IsSparse()) + column->SetDataSize(0); + else { - ivAttributeIndexes = cast(IntVector*, oaLoadedBlock.GetAt(nAttribute)); - cIndex = - kwoPartIndices->GetContinuousValueAt(livLoadedRecodedAttributeIndexes.GetAt(nAttribute)); - ivAttributeIndexes->SetAt(nInstance, lrint(cIndex) - 1); + physicalLayout.SetAttributeDataSizeAt(nAttribute, + layout->GetInstanceNumberAtChunk(nChunkIndex)); + bOk = bOk and column->SetDataSize(layout->GetInstanceNumberAtChunk(nChunkIndex)); + if (not bOk) + { + AddError("Could not allocate block vector for column data"); + break; + } } - delete kwoPartIndices; - - // Suivi de la tache - // Ici est le seul endroit de cette classe qui change l'etat du TaskProgression. - // Il y a un leger retard de sa MAJ quand le schema a plusieurs slices et on utilise un fichier de - // chunk. Moralement, l'ecriture du fichier devrait etre aussi suivi mais cela entrainarait utiliser des - // etats en plus, des calculs plus tordus et une perte de localite de cette partie du code. La lecture - // du bloc depuis le fichier binaire est tres rapide et n'a pas besoin de suivi. - if (TaskProgression::IsInTask() and periodicTestInterruption.IsTestAllowed(nInstance)) - { - // Mise-a-jour de la barre d'avancement - dProgressPercentSlice = 100.0 * (nInstance + 1) / nChunkInstanceNumber; - if (layout->GetSliceNumber() == 1) - dProgressPercent = dProgressPercentSlice; - else - dProgressPercent = - 100.0 * nSlice / nSliceNumber + dProgressPercentSlice * (1.0 / nSliceNumber); - TaskProgression::DisplayProgression(int(dProgressPercent)); + if (bOk) + oaLoadedBlock.SetAt(nSliceAttribute, column); + } - // Verification d'une interruption utilisateur - bOk = bOk and not TaskProgression::IsInterruptionRequested(); + // Lecture des attributs de la slice courante depuis le KWDataTableSliceSet en entree + if (bOk) + { + for (nChunkInstance = 0; nChunkInstance < nChunkInstanceNumber; nChunkInstance++) + { + // Lecture de l'objet + kwoPartIndices = sliceSet->Read(); + bOk = bOk and (kwoPartIndices != NULL); if (not bOk) break; + + // Trasposition des donnes de l'objet dans aux colonne du bloc + for (nSliceAttribute = 0; nSliceAttribute < nSliceAttributeNumber; nSliceAttribute++) + { + nAttribute = nSliceAttributeOffset + nSliceAttribute; + column = cast(SNBDataTableBinarySliceSetColumn*, oaLoadedBlock.GetAt(nSliceAttribute)); + cIndex = kwoPartIndices->GetContinuousValueAt( + livLoadedRecodedAttributeIndexes.GetAt(nSliceAttribute)); + if (column->IsSparse()) + { + column->AddData(nChunkInstance); + column->AddData(lrint(cIndex) - 1); + } + else + column->SetDataAt(nChunkInstance, lrint(cIndex) - 1); + } + delete kwoPartIndices; + + // Suivi de la tache + // Ici est le seul endroit de *cette classe* qui change l'etat du TaskProgression. + // Il y a un leger retard de sa MAJ quand le schema a plusieurs slices et on utilise un fichier de chunk. + // Moralement, l'ecriture du fichier devrait etre aussi suivi mais cela entrainarait utiliser + // des etats en plus, des calculs plus tordus et une perte de localite de cette partie du code. + // La lecture du bloc depuis le fichier binaire est tres rapide et n'a pas besoin de suivi. + if (TaskProgression::IsInTask() and periodicTestInterruption.IsTestAllowed(nChunkInstance)) + { + // Mise-a-jour de la barre d'avancement + dProgressPercentSlice = 100.0 * (nChunkInstance + 1) / nChunkInstanceNumber; + if (layout->GetSliceNumber() == 1) + dProgressPercent = dProgressPercentSlice; + else + dProgressPercent = 100.0 * nSlice / nSliceNumber + + dProgressPercentSlice * (1.0 / nSliceNumber); + TaskProgression::DisplayProgression(int(dProgressPercent)); + + // Arret si il y a une interruption utilisateur + bOk = bOk and not TaskProgression::IsInterruptionRequested(); + if (not bOk) + break; + } } } Global::DesactivateErrorFlowControl(); @@ -1354,25 +1627,32 @@ boolean SNBDataTableBinarySliceSetBuffer::LoadBlockFromSliceSetAt(int nChunk, in // Fermeture du slice set quoi qu'il arrive bOk = sliceSet->Close() and bOk; - // Si succes : mise a jour des indices du bloc charge + // Mise-a-jour du layout physique pour les variables sparse de cette slice if (bOk) { - nLoadedBlockChunkIndex = nChunk; - nLoadedBlockSliceIndex = nSlice; + for (nSliceAttribute = 0; nSliceAttribute < nSliceAttributeNumber; nSliceAttribute++) + { + nAttribute = nSliceAttributeOffset + nSliceAttribute; + column = cast(SNBDataTableBinarySliceSetColumn*, oaLoadedBlock.GetAt(nSliceAttribute)); + if (column->IsSparse()) + physicalLayout.SetAttributeDataSizeAt(nAttribute, column->GetDataSize()); + } } + + // Si succes : mise a jour des indices du bloc charge + if (bOk) + nLoadedBlockSliceIndex = nSlice; // Si echec : mise a un etat invalide, pas message d'erreur // Rationale : // - Les messages d'erreur de bas niveau se font au niveau du KWDataTableSliceSet // - Les messages du niveau utilisateur se font au niveau du SNBSelectiveNaiveBayesTrainingTask else - { - nLoadedBlockChunkIndex = -1; nLoadedBlockSliceIndex = -1; - } + return bOk; } -const ALString SNBDataTableBinarySliceSetBuffer::CreateTempFileForChunk(int nChunk) const +const ALString SNBDataTableBinarySliceSetChunkBuffer::CreateTempFileForChunk(int nChunk) const { ALString sFileName; @@ -1383,27 +1663,22 @@ const ALString SNBDataTableBinarySliceSetBuffer::CreateTempFileForChunk(int nChu return FileService::CreateUniqueTmpFile(sFileName, this); } -boolean SNBDataTableBinarySliceSetBuffer::OpenOutputDataFileAtChunk(int nChunk) +boolean SNBDataTableBinarySliceSetChunkBuffer::OpenOutputDataFile() { boolean bOk; - ALString sChunkFilePath; require(fChunkDataFile == NULL); - require(nOpenFileChunkIndex == -1); - require(svChunkFilePaths.GetAt(nChunk) != ""); + require(sChunkFilePath != ""); - sChunkFilePath = svChunkFilePaths.GetAt(nChunk); bOk = FileService::OpenOutputBinaryFile(sChunkFilePath, fChunkDataFile); if (not bOk) fChunkDataFile = NULL; - else - nOpenFileChunkIndex = nChunk; - ensure(0 <= nOpenFileChunkIndex and nOpenFileChunkIndex < layout->GetChunkNumber()); + ensure(not bOk or fChunkDataFile != NULL); return bOk; } -boolean SNBDataTableBinarySliceSetBuffer::WriteToDataFile(int* writeBuffer, int nIntNumber) const +boolean SNBDataTableBinarySliceSetChunkBuffer::WriteToDataFile(int* writeBuffer, int nIntNumber) const { boolean bOk = true; int nWrittenIntNumber; @@ -1411,7 +1686,6 @@ boolean SNBDataTableBinarySliceSetBuffer::WriteToDataFile(int* writeBuffer, int require(writeBuffer != NULL); require(nIntNumber >= 0); require(fChunkDataFile != NULL); - require(0 <= nOpenFileChunkIndex and nOpenFileChunkIndex < layout->GetChunkNumber()); nWrittenIntNumber = (int)fwrite(writeBuffer, sizeof(int), nIntNumber, fChunkDataFile); bOk = (nWrittenIntNumber == nIntNumber); @@ -1419,24 +1693,20 @@ boolean SNBDataTableBinarySliceSetBuffer::WriteToDataFile(int* writeBuffer, int return bOk; } -boolean SNBDataTableBinarySliceSetBuffer::CloseOutputDataFile() +boolean SNBDataTableBinarySliceSetChunkBuffer::CloseOutputDataFile() { boolean bOk; - ALString sChunkFilePath; require(fChunkDataFile != NULL); - require(0 <= nOpenFileChunkIndex and nOpenFileChunkIndex < layout->GetChunkNumber()); - require(svChunkFilePaths.GetAt(nOpenFileChunkIndex) != ""); + require(sChunkFilePath != ""); - sChunkFilePath = svChunkFilePaths.GetAt(nOpenFileChunkIndex); bOk = FileService::CloseOutputBinaryFile(sChunkFilePath, fChunkDataFile); fChunkDataFile = NULL; - nOpenFileChunkIndex = -1; return bOk; } -void SNBDataTableBinarySliceSetBuffer::LoadRecodedAttributesAtSlice( +void SNBDataTableBinarySliceSetChunkBuffer::LoadRecodedAttributesAtSlice( int nSlice, KWClass* recoderClass, const SNBDataTableBinarySliceSetSchema* schema, KWLoadIndexVector* livLoadedRecodedAttributeIndexes) const { @@ -1474,377 +1744,362 @@ void SNBDataTableBinarySliceSetBuffer::LoadRecodedAttributesAtSlice( ensure(recoderClass->Check()); } -boolean SNBDataTableBinarySliceSetBuffer::IsInitialized() const +boolean SNBDataTableBinarySliceSetChunkBuffer::IsInitialized() const { boolean bOk = true; - int nChunk; bOk = bOk and layout != NULL; - bOk = bOk and layout->Check(); - if (bOk) { - bOk = bOk and svChunkFilePaths.GetSize() == layout->GetChunkNumber(); - - if (bIsSingleChunkBuffer) - bOk = bOk and IsInitializedOnlyAtChunk(nSingleInitializedChunkIndex); - - else - { - for (nChunk = 0; nChunk < layout->GetChunkNumber(); nChunk++) - bOk = bOk and IsInitializedAtChunk(nChunk); - } - } - - return bOk; -} - -boolean SNBDataTableBinarySliceSetBuffer::IsInitializedAtChunk(int nChunk) const -{ - boolean bOk = true; - - bOk = bOk and layout != NULL; - bOk = bOk and layout->Check(); - - if (bIsSingleChunkBuffer) - { - bOk = bOk and nChunk == nSingleInitializedChunkIndex; + bOk = bOk and layout->IsInitialized(); if (layout->GetSliceNumber() > 1) - bOk = bOk and svChunkFilePaths.GetAt(nChunk) != ""; + bOk = bOk and sChunkFilePath != ""; + bOk = bOk and physicalLayout.IsInitialized(); } - else - bOk = bOk and svChunkFilePaths.GetAt(nChunk) != ""; return bOk; } -boolean SNBDataTableBinarySliceSetBuffer::IsInitializedOnlyAtChunk(int nChunk) const -{ - return bIsSingleChunkBuffer and IsInitializedAtChunk(nChunk); -} - -void SNBDataTableBinarySliceSetBuffer::CleanWorkingData() +void SNBDataTableBinarySliceSetChunkBuffer::CleanWorkingData() { - int nChunk; - ALString sChunkFilePath; - - // Nettoyage des fichiers de chunk - for (nChunk = 0; nChunk < svChunkFilePaths.GetSize(); nChunk++) + // Elimination du fichier de chunk s'il existe + if (sChunkFilePath != "") { - sChunkFilePath = svChunkFilePaths.GetAt(nChunk); - if (sChunkFilePath != "") - FileService::RemoveFile(sChunkFilePath); + FileService::RemoveFile(sChunkFilePath); + sChunkFilePath = ""; } // Nettoyage des objets de travail + nChunkIndex = -1; oaLoadedBlock.DeleteAll(); - nLoadedBlockChunkIndex = -1; nLoadedBlockSliceIndex = -1; - bIsSingleChunkBuffer = false; - nSingleInitializedChunkIndex = -1; } -boolean SNBDataTableBinarySliceSetBuffer::CollectRecodedAttributeIndexes(int nChunk, int nAttribute, - IntVector*& ivOutput) +boolean +SNBDataTableBinarySliceSetChunkBuffer::CollectRecodedAttributeIndexes(int nAttribute, + SNBDataTableBinarySliceSetColumn*& outputColumn) { boolean bOk = true; int nAttributeSliceIndex; int nAttributeRelativeIndex; - require(IsInitializedAtChunk(nChunk)); + require(IsInitialized()); require(0 <= nAttribute and nAttribute < layout->GetAttributeNumber()); // Chargement du bloc depuis le disque si besoin nAttributeSliceIndex = layout->GetSliceIndexAtAttribute(nAttribute); - if (nChunk != nLoadedBlockChunkIndex or nAttributeSliceIndex != nLoadedBlockSliceIndex) - bOk = LoadBlockAt(nChunk, nAttributeSliceIndex); + if (nAttributeSliceIndex != nLoadedBlockSliceIndex) + bOk = LoadBlockAt(nAttributeSliceIndex); // On pointe le pointeur de sortie vers le vecteur d'indices de l'attribut if (bOk) { nAttributeRelativeIndex = layout->GetRelativeIndexAtAttribute(nAttribute); - ivOutput = cast(IntVector*, oaLoadedBlock.GetAt(nAttributeRelativeIndex)); + outputColumn = cast(SNBDataTableBinarySliceSetColumn*, oaLoadedBlock.GetAt(nAttributeRelativeIndex)); } - - ensure(ivOutput->GetSize() == layout->GetMaxChunkInstanceNumber()); return bOk; } -boolean SNBDataTableBinarySliceSetBuffer::LoadBlockAt(int nChunk, int nSlice) +boolean SNBDataTableBinarySliceSetChunkBuffer::LoadBlockAt(int nSlice) { boolean bOk = true; - int nInstance; + int nColumnData; int nAttribute; - longint lRemainingIndexNumber; - IntVector* ivAttributeIndexes; - longint lReadIndexesNumber; - int nIndex; - longint lValueNumber; int nRequestedBufferSize; int* buffer; int nIntBufferSize; - int nSizeToRead; + int nSliceAttribute; + SNBDataTableBinarySliceSetColumn* column; + longint lRemainingAttributeIntNumber; + longint lRemainingIndexNumber; + longint lReadIndexNumber; + int nBufferIndex; + int nToReadIndexNumber; - require(0 <= nChunk and nChunk < layout->GetChunkNumber()); - require(0 <= nSlice and nSlice < layout->GetSliceNumber()); - require(IsInitializedAtChunk(nChunk)); + require(IsInitialized()); require(fChunkDataFile == NULL); - require(Check()); + require(physicalLayout.IsInitialized()); + require(0 <= nSlice and nSlice < layout->GetSliceNumber()); // Verification d'integrite basique du fichier - bOk = bOk and CheckDataFileAtChunk(nChunk); + bOk = bOk and CheckChunkFile(); // Ouverture du fichier et positionement au debut du bloc - bOk = bOk and FileService::OpenInputBinaryFile(GetDataFilePathAtChunk(nChunk), fChunkDataFile); - bOk = bOk and FileService::SeekPositionInBinaryFile(fChunkDataFile, layout->GetBlockOffsetAt(nChunk, nSlice)); + bOk = bOk and FileService::OpenInputBinaryFile(sChunkFilePath, fChunkDataFile); + bOk = bOk and FileService::SeekPositionInBinaryFile(fChunkDataFile, + physicalLayout.GetBlockOffsetAt(nSlice) * sizeof(int)); + // Reinitialisation des colonnes du bloc if (bOk) { - // TODO FOR FELIPE: a optimiser en fonction de la taille utile ici, et a adapter aux donnees sparse - // Nombre total de valeurs du binary slice set - lValueNumber = layout->GetInstanceNumber() * (longint)layout->GetAttributeNumber(); - nRequestedBufferSize = max((int)BufferedFile::nDefaultBufferSize, GetHugeBufferSize()); - if (lValueNumber * sizeof(int) < nRequestedBufferSize) - nRequestedBufferSize = int(lValueNumber * sizeof(int)); + for (nSliceAttribute = 0; nSliceAttribute < layout->GetAttributeNumberAtSlice(nSlice); + nSliceAttribute++) + { + nAttribute = layout->GetAttributeOffsetAtSlice(nSlice) + nSliceAttribute; + column = cast(SNBDataTableBinarySliceSetColumn*, + oaLoadedBlock.GetAt(layout->GetRelativeIndexAtAttribute(nAttribute))); + column->SetSparse(layout->IsAttributeSparseAt(nAttribute)); + column->SetDataSize(physicalLayout.GetAttributeDataSizeAt(nAttribute)); + bOk = column->SetDataSize(physicalLayout.GetAttributeDataSizeAt(nAttribute)); + if (not bOk) + break; + } + } + + // Remplisement des colonnes depuis le fichier de chunk + if (bOk) + { + // Calcul de la taille du buffer de lecture + nRequestedBufferSize = max((int)(8 * lMB), GetHugeBufferSize()); + if (physicalLayout.GetDataSize() * sizeof(int) < nRequestedBufferSize) + nRequestedBufferSize = int(physicalLayout.GetDataSize() * sizeof(int)); assert(nRequestedBufferSize % sizeof(int) == 0); - // TODO FELIPE - // Demande d'un buffer de grande taille + // Demande d'un buffer de lecture de grande taille buffer = (int*)GetHugeBuffer(nRequestedBufferSize); nIntBufferSize = nRequestedBufferSize / sizeof(int); // Lecture par colonne du bloc defini par le chunk et la slice - nInstance = 0; - nAttribute = -1; - ivAttributeIndexes = NULL; - lRemainingIndexNumber = layout->GetBlockSizeAt(nChunk, nSlice); + nAttribute = layout->GetAttributeOffsetAtSlice(nSlice); + nSliceAttribute = 0; + nColumnData = 0; + lRemainingAttributeIntNumber = + physicalLayout.GetAttributeDataSizeAt(layout->GetAttributeOffsetAtSlice(nSlice)); + column = cast(SNBDataTableBinarySliceSetColumn*, oaLoadedBlock.GetAt(0)); + lRemainingIndexNumber = physicalLayout.GetBlockSizeAt(nSlice); while (lRemainingIndexNumber > 0 and not ferror(fChunkDataFile) and not feof(fChunkDataFile)) { // Lecture selon la taille necessaire, selon que l'on soit arrive ou non a la fin de la slice if (lRemainingIndexNumber >= nIntBufferSize) - nSizeToRead = nIntBufferSize; + nToReadIndexNumber = nIntBufferSize; else - nSizeToRead = (int)lRemainingIndexNumber; - lReadIndexesNumber = (longint)fread(buffer, sizeof(int), nSizeToRead, fChunkDataFile); - lRemainingIndexNumber -= lReadIndexesNumber; + nToReadIndexNumber = (int)lRemainingIndexNumber; + lReadIndexNumber = (longint)fread(buffer, sizeof(int), nToReadIndexNumber, fChunkDataFile); + lRemainingIndexNumber -= lReadIndexNumber; + assert(lReadIndexNumber == (longint)nToReadIndexNumber); - // Transfer des indexes vers les vecteurs du buffer - for (nIndex = 0; nIndex < lReadIndexesNumber; nIndex++) + // Transfer des indexes vers les colonnes du buffer + for (nBufferIndex = 0; nBufferIndex < lReadIndexNumber; nBufferIndex++) { - if (nInstance == 0) + // Si pas de donnees restants pour la colonne: Initialisation de la suivante + if (lRemainingAttributeIntNumber == 0) { nAttribute++; - ivAttributeIndexes = cast(IntVector*, oaLoadedBlock.GetAt(nAttribute)); + nSliceAttribute++; + nColumnData = 0; + lRemainingAttributeIntNumber = + physicalLayout.GetAttributeDataSizeAt(nAttribute); + column = cast(SNBDataTableBinarySliceSetColumn*, + oaLoadedBlock.GetAt(nSliceAttribute)); } - ivAttributeIndexes->SetAt(nInstance, buffer[nIndex]); - nInstance = (nInstance + 1) % layout->GetInstanceNumberAtChunk(nChunk); + + // Transfer de la prochaine donnee vers sa colonne + column->SetDataAt(nColumnData, buffer[nBufferIndex]); + lRemainingAttributeIntNumber--; + nColumnData++; } } - // Verification que on a lu le nombre exacte d'indices & qu'il n'y pas eu d'errer de lecture + // Verification que on a lu le nombre exacte d'indices & qu'il n'y pas eu d'erreur de lecture + bOk = bOk and lRemainingAttributeIntNumber == 0; bOk = bOk and lRemainingIndexNumber == 0; bOk = bOk and not ferror(fChunkDataFile); if (not bOk and not ferror(fChunkDataFile)) - AddError("Corrupted binary data file " + GetDataFilePathAtChunk(nChunk)); + AddError("Corrupted binary data file " + sChunkFilePath); else if (ferror(fChunkDataFile)) - AddError("IO error when reading binary data file " + GetDataFilePathAtChunk(nChunk)); + AddError("IO error when reading binary data file " + sChunkFilePath); // Cloture du fichier quoiqu'il arrive - bOk = FileService::CloseInputBinaryFile(GetDataFilePathAtChunk(nChunk), fChunkDataFile) and bOk; + bOk = FileService::CloseInputBinaryFile(sChunkFilePath, fChunkDataFile) and bOk; fChunkDataFile = NULL; } - // Si lecture OK : Mise a jour des indexes de chunk et slice + // Si lecture OK : Mise a jour de l'index de la slice du bloc en memoire if (bOk) - { - nLoadedBlockChunkIndex = nChunk; nLoadedBlockSliceIndex = nSlice; - } // Si lecture KO : mise a un etat invalide else - { - nLoadedBlockChunkIndex = -1; nLoadedBlockSliceIndex = -1; - } + ensure(fChunkDataFile == NULL); + ensure(Check()); return bOk; } -ALString SNBDataTableBinarySliceSetBuffer::GetDataFilePathAtChunk(int nChunk) const -{ - require(IsInitializedAtChunk(nChunk)); - return svChunkFilePaths.GetAt(nChunk); -} - -boolean SNBDataTableBinarySliceSetBuffer::Check() const +boolean SNBDataTableBinarySliceSetChunkBuffer::Check() const { boolean bOk = true; - int nChunk; int nAttribute; - IntVector* ivAttributeIndexes; + SNBDataTableBinarySliceSetColumn* column; // Verification du layout bOk = bOk and layout != NULL; bOk = bOk and layout->Check(); + bOk = bOk and physicalLayout.Check(); // Verification du bloc charge en memoire - bOk = bOk and 0 <= nLoadedBlockChunkIndex and nLoadedBlockChunkIndex < layout->GetChunkNumber(); bOk = bOk and 0 <= nLoadedBlockSliceIndex and nLoadedBlockSliceIndex < layout->GetSliceNumber(); - if (bIsSingleChunkBuffer) - bOk = bOk and nLoadedBlockChunkIndex == nSingleInitializedChunkIndex; if (bOk) { bOk = bOk and oaLoadedBlock.GetSize() == layout->GetMaxSliceAttributeNumber(); for (nAttribute = 0; nAttribute < oaLoadedBlock.GetSize(); nAttribute++) { - ivAttributeIndexes = cast(IntVector*, oaLoadedBlock.GetAt(nAttribute)); - bOk = bOk and ivAttributeIndexes->GetSize() == layout->GetMaxChunkInstanceNumber(); + column = cast(SNBDataTableBinarySliceSetColumn*, oaLoadedBlock.GetAt(nAttribute)); + if (not column->IsSparse()) + bOk = bOk and column->GetDataSize() == layout->GetInstanceNumberAtChunk(nChunkIndex); + bOk = bOk and column->Check(); } } // Verification des fichiers de chunk if (bOk) { - // Cas de chunk unique - if (bIsSingleChunkBuffer) + // Cas de slice unique: Aucun fichie doit etre initialize + if (layout->GetSliceNumber() == 1) { - // Cas de slice unique: Aucun fichie doit etre initialize - if (layout->GetSliceNumber() == 1) - { - bOk = bOk and nLoadedBlockSliceIndex == 0; - for (nChunk = 0; nChunk < layout->GetChunkNumber(); nChunk++) - bOk = bOk and svChunkFilePaths.GetAt(nChunk) == ""; - } - // Cas de plusieurs slices: Verification du fichier de chunk - else - { - bOk = bOk and CheckDataFileAtChunk(nSingleInitializedChunkIndex); - for (nChunk = 0; nChunk < layout->GetChunkNumber(); nChunk++) - { - if (nChunk != nSingleInitializedChunkIndex) - bOk = bOk and svChunkFilePaths.GetAt(nChunk) == ""; - } - } + bOk = bOk and nLoadedBlockSliceIndex == 0; + bOk = bOk and sChunkFilePath == ""; } - // Cas plusieurs chunks : Verification de tous les fichiers de chunk + // Cas de plusieurs slices: Verification du fichier de chunk else { - for (nChunk = 0; nChunk < layout->GetChunkNumber(); nChunk++) - bOk = bOk and CheckDataFileAtChunk(nChunk); + bOk = bOk and sChunkFilePath != ""; + bOk = bOk and CheckChunkFile(); } } return bOk; } -boolean SNBDataTableBinarySliceSetBuffer::CheckDataFileAtChunk(int nChunk) const +boolean SNBDataTableBinarySliceSetChunkBuffer::CheckChunkFile() const { boolean bOk = true; longint lDataFileSize; longint lExpectedDataFileSize; + int nSlice; ALString sTmp; - ALString sDataFilePath; - - bOk = bOk and (0 <= nChunk and nChunk < layout->GetChunkNumber()); - if (not bOk) - AddError(sTmp + "Chunk index " + IntToString(nChunk) + " out of range"); if (bOk) { - sDataFilePath = svChunkFilePaths.GetAt(nChunk); - bOk = bOk and (sDataFilePath != ""); + bOk = bOk and (sChunkFilePath != ""); if (not bOk) - AddError(sTmp + "Chunk no." + IntToString(nChunk) + " has no associated chunk file path"); + AddError(sTmp + "Chunk no." + IntToString(nChunkIndex) + " has no associated chunk file path"); } if (bOk) { - lDataFileSize = FileService::GetFileSize(sDataFilePath); - lExpectedDataFileSize = - (longint)layout->GetInstanceNumberAtChunk(nChunk) * layout->GetAttributeNumber() * sizeof(int); + lDataFileSize = FileService::GetFileSize(sChunkFilePath); + lExpectedDataFileSize = 0; + for (nSlice = 0; nSlice < layout->GetSliceNumber(); nSlice++) + lExpectedDataFileSize += sizeof(int) * physicalLayout.GetBlockSizeAt(nSlice); + bOk = bOk and (lDataFileSize == lExpectedDataFileSize); if (not bOk) - AddError("Data file " + sDataFilePath + " has size " + LongintToString(lDataFileSize) + + AddError("Data file " + sChunkFilePath + " has size " + LongintToString(lDataFileSize) + " bytes " + "instead of the expected " + LongintToString(lExpectedDataFileSize) + " bytes"); } return bOk; } -const ALString SNBDataTableBinarySliceSetBuffer::GetClassLabel() const +const ALString SNBDataTableBinarySliceSetChunkBuffer::GetClassLabel() const { return "Data Table Binary Slice Set Buffer"; } -void SNBDataTableBinarySliceSetBuffer::WriteContentsAsTSV(ostream& ost, const SNBDataTableBinarySliceSetSchema* schema) +void SNBDataTableBinarySliceSetChunkBuffer::WriteContentsAsTSV(ostream& ost, + const SNBDataTableBinarySliceSetSchema* schema) { + const int nMaxInstanceNumber = 200; + int nDisplayedInstanceNumber; boolean bOk = true; - int nChunk; int nSlice; int nInstance; int nAttribute; int nChunkInstance; - IntVector* ivRecodedAttributeIndexes; + SNBDataTableBinarySliceSetColumn* column; + IntVector ivCurrentValueIndexes; if (layout == NULL) ost << "No layout defined\n"; else { - for (nChunk = 0; nChunk < layout->GetChunkNumber(); nChunk++) + if (IsInitialized()) { - ost << "Chunk no." << nChunk; - if (IsInitializedAtChunk(nChunk)) + // Entete du bloc + ost << "Chunk #" << nChunkIndex << ": Initialized\n"; + if (sChunkFilePath != "") + ost << "File: " << FileService::GetURIFilePathName(sChunkFilePath) << "\n"; + else + ost << "File: None (no slicing)\n"; + ost << physicalLayout << "\n"; + ivCurrentValueIndexes.SetSize(layout->GetAttributeNumber()); + for (nAttribute = 0; nAttribute < layout->GetAttributeNumber(); nAttribute++) + ivCurrentValueIndexes.SetAt(nAttribute, -1); + for (nSlice = 0; nSlice < layout->GetSliceNumber(); nSlice++) { - // Entete du bloc - ost << ": Initialized\n"; - ost << "File: " << FileService::GetURIFilePathName(GetDataFilePathAtChunk(nChunk)) - << "\n"; - ost << "Offsets: "; - ost << layout->GetBlockOffsetAt(nChunk, 0); - for (nSlice = 1; nSlice < layout->GetSliceNumber(); nSlice++) - ost << ", " << layout->GetBlockOffsetAt(nChunk, nSlice); + nInstance = layout->GetInstanceOffsetAtChunk(nChunkIndex); + ost << "chunk\tslice\tind\tchind"; + for (nAttribute = layout->GetAttributeOffsetAtSlice(nSlice); + nAttribute < layout->GetAttributeOffsetAtSlice(nSlice) + + layout->GetAttributeNumberAtSlice(nSlice); + nAttribute++) + { + ost << "\t" << schema->GetAttributeAt(nAttribute)->GetNativeAttributeName(); + if (layout->IsAttributeSparseAt(nAttribute)) + ost << "[S]"; + } ost << "\n"; - for (nSlice = 0; nSlice < layout->GetSliceNumber(); nSlice++) + + // Ecriture des contenus du fichier + nDisplayedInstanceNumber = + min(nMaxInstanceNumber, layout->GetInstanceNumberAtChunk(nChunkIndex)); + for (nChunkInstance = 0; nChunkInstance < nDisplayedInstanceNumber; nChunkInstance++) { - nInstance = layout->GetInstanceOffsetAtChunk(nChunk); - ost << "chunk\tslice\tind"; + ost << nChunkIndex << "\t" << nSlice << "\t" << nInstance << "\t" + << nChunkInstance; for (nAttribute = layout->GetAttributeOffsetAtSlice(nSlice); nAttribute < layout->GetAttributeOffsetAtSlice(nSlice) + layout->GetAttributeNumberAtSlice(nSlice); nAttribute++) { - ost << "\t" - << schema->GetAttributeAt(nAttribute)->GetNativeAttributeName(); - } - ost << "\n"; + bOk = CollectRecodedAttributeIndexes(nAttribute, column); + if (column->IsSparse()) + { + ost << "\t"; + if (ivCurrentValueIndexes.GetAt(nAttribute) == -1) + ivCurrentValueIndexes.SetAt(nAttribute, 0); - // Ecriture des contenus du fichier - for (nChunkInstance = 0; - nChunkInstance < layout->GetInstanceNumberAtChunk(nChunk); - nChunkInstance++) - { - ost << nChunk << "\t" << nSlice << "\t" << nInstance; - for (nAttribute = layout->GetAttributeOffsetAtSlice(nSlice); - nAttribute < layout->GetAttributeOffsetAtSlice(nSlice) + - layout->GetAttributeNumberAtSlice(nSlice); - nAttribute++) + else if (ivCurrentValueIndexes.GetAt(nAttribute) == + column->GetValueNumber()) + continue; + + if (column->GetSparseValueInstanceIndexAt( + ivCurrentValueIndexes.GetAt(nAttribute)) == + nChunkInstance) + { + ost << column->GetSparseValueAt( + ivCurrentValueIndexes.GetAt(nAttribute)); + ivCurrentValueIndexes.UpgradeAt(nAttribute, 1); + } + } + else { - bOk = CollectRecodedAttributeIndexes(nChunk, nAttribute, - ivRecodedAttributeIndexes); - ost << "\t" << ivRecodedAttributeIndexes->GetAt(nChunkInstance); + ivCurrentValueIndexes.SetAt(nAttribute, nChunkInstance); + ost << "\t" << column->GetDenseValueAt(nChunkInstance); } - ost << "\n"; - nInstance++; } + ost << endl; + nInstance++; } } - else - ost << ": Not initialized\n"; } + else + ost << ": Not initialized\n"; } } -longint SNBDataTableBinarySliceSetBuffer::ComputeNecessaryMemory(int nInstanceNumber, int nChunkNumber, - int nAttributeNumber, int nSliceNumber) +longint SNBDataTableBinarySliceSetChunkBuffer::ComputeNecessaryMemory(int nInstanceNumber, int nChunkNumber, + int nAttributeNumber, int nSliceNumber) { ObjectArray oaDummy; @@ -1861,11 +2116,9 @@ longint SNBDataTableBinarySliceSetBuffer::ComputeNecessaryMemory(int nInstanceNu // Objets de la instance + // FILE de lecture/ecriture (fChunkDataFile) + // Tableau de MemSegmentByteSize int's (buffer) + - // Vecteur de nAttributes KWLoadIndex's (livLoadedRecodedAttributeIndexes dans InitializeDataFileAtChunk) + + // Vecteur de nAttributes KWLoadIndex's (livLoadedRecodedAttributeIndexes dans InitializeDataFileFromSliceSetAt) + // Tableau de nMaxAttributesPerSlice IntVectors de taille nMaxInstancesPerChunk (oaLoadedBlock) - return sizeof(SNBDataTableBinarySliceSetBuffer) + sizeof(FILE) + - // TODO FOR FELIPE: a supprimer - //(longint)SNBDataTableBinarySliceSetBuffer::nIntBufferSize * sizeof(int) + + return sizeof(SNBDataTableBinarySliceSetChunkBuffer) + sizeof(FILE) + (longint)nAttributeNumber * sizeof(KWLoadIndex) + nMaxAttributesPerSlice * (oaDummy.GetUsedMemoryPerElement() + sizeof(IntVector) + (longint)nMaxInstancesPerChunk * sizeof(int)); @@ -1879,7 +2132,6 @@ SNBDataTableBinarySliceSet::SNBDataTableBinarySliceSet() nInitialAttributeNumber = -1; bIsError = false; dataPreparationClass = NULL; - nInitializedChunkIndex = -1; } SNBDataTableBinarySliceSet::~SNBDataTableBinarySliceSet() @@ -1887,11 +2139,13 @@ SNBDataTableBinarySliceSet::~SNBDataTableBinarySliceSet() CleanWorkingData(true); } -void SNBDataTableBinarySliceSet::Initialize(KWClassStats* classStats, int nChunkNumber, int nMaxAttributes, - int nSliceNumber) +void SNBDataTableBinarySliceSet::InitializeWithoutChunkBuffer(KWClassStats* classStats, int nChunkNumber, + int nMaxAttributes, int nSliceNumber) { + // Initialisation du schema d'attributs InitializeSchemaFromClassStats(classStats, nMaxAttributes); + // S'il y a des attributs informatifs on pursuit l'initialisation if (HasUsableAttributes()) { // Initialisation des indices des valeurs de la cible @@ -1903,11 +2157,6 @@ void SNBDataTableBinarySliceSet::Initialize(KWClassStats* classStats, int nChunk // Initialisation de l'iterateur randomise d'attributes randomizedAttributeIterator.Initialize(&schema, &layout); - - // Initialisation du buffer - dataBuffer.SetLayout(&layout); - dataBuffer.Initialize(dataPreparationClass->GetDataPreparationClass(), - classStats->GetDataTableSliceSet(), &schema); } } @@ -1941,7 +2190,7 @@ void SNBDataTableBinarySliceSet::InitializeSchemaFromClassStats(KWClassStats* cl oaSortedAttributes.SetCompareFunction(KWDataPreparationAttributeCompareSortValue); oaSortedAttributes.Sort(); - // Calcul du nombre maximal effectif d'attribus pour l'apprentissage + // Calcul du nombre maximal effectif d'attributs pour l'apprentissage nInitialAttributeNumber = dataPreparationClass->GetDataPreparationAttributes()->GetSize(); if (nMaxAttributes > 0) nEffectiveMaxAttributes = nMaxAttributes; @@ -1977,29 +2226,15 @@ void SNBDataTableBinarySliceSet::InitializeSchemaFromClassStats(KWClassStats* cl ensure(Check()); } -boolean SNBDataTableBinarySliceSet::InitializeAllChunks(KWDataTableSliceSet* sliceSet) +boolean SNBDataTableBinarySliceSet::InitializeBufferAtChunk(int nChunk, KWDataTableSliceSet* sliceSet) { boolean bOk; require(dataPreparationClass != NULL); require(layout.IsInitialized()); require(schema.IsInitialized()); - dataBuffer.SetLayout(&layout); - bOk = dataBuffer.Initialize(dataPreparationClass->GetDataPreparationClass(), sliceSet, &schema); - - return bOk; -} - -boolean SNBDataTableBinarySliceSet::InitializeBufferOnlyAtChunk(int nChunk, KWDataTableSliceSet* sliceSet) -{ - boolean bOk; - require(dataPreparationClass != NULL); - require(layout.IsInitialized()); - require(schema.IsInitialized()); - - dataBuffer.SetLayout(&layout); - bOk = dataBuffer.InitializeOnlyAtChunk(nChunk, dataPreparationClass->GetDataPreparationClass(), sliceSet, - &schema); + chunkBuffer.SetLayout(&layout); + bOk = chunkBuffer.Initialize(nChunk, dataPreparationClass->GetDataPreparationClass(), sliceSet, &schema); return bOk; } @@ -2131,22 +2366,16 @@ int SNBDataTableBinarySliceSet::ComputeContinuousTargetValuePartIndex(Continuous return nIndex; } -boolean SNBDataTableBinarySliceSet::IsInitialized() const +boolean SNBDataTableBinarySliceSet::IsInitializedWithoutBuffer() const { - return layout.IsInitialized() and schema.IsInitialized() and dataBuffer.IsInitialized() and - ivTargetValueIndexes.GetSize() > 0 and nInitialAttributeNumber > 0; + return layout.IsInitialized() and schema.IsInitialized() and ivTargetValueIndexes.GetSize() > 0 and + nInitialAttributeNumber > 0 and not chunkBuffer.IsInitialized(); } -boolean SNBDataTableBinarySliceSet::IsPartiallyInitialized() const +boolean SNBDataTableBinarySliceSet::IsInitialized() const { - boolean bIsBufferPartiallyInitialized; - - bIsBufferPartiallyInitialized = - dataBuffer.IsInitialized() or - (nInitializedChunkIndex >= 0 and dataBuffer.IsInitializedAtChunk(nInitializedChunkIndex)); - - return layout.IsInitialized() and schema.IsInitialized() and bIsBufferPartiallyInitialized and - ivTargetValueIndexes.GetSize() > 0 and nInitialAttributeNumber > 0; + return layout.IsInitialized() and schema.IsInitialized() and ivTargetValueIndexes.GetSize() > 0 and + nInitialAttributeNumber > 0 and chunkBuffer.IsInitialized(); } boolean SNBDataTableBinarySliceSet::HasUsableAttributes() const @@ -2158,7 +2387,7 @@ void SNBDataTableBinarySliceSet::CleanWorkingData(boolean bDeleteDataPreparation { layout.CleanWorkingData(); schema.CleanWorkingData(); - dataBuffer.CleanWorkingData(); + chunkBuffer.CleanWorkingData(); randomizedAttributeIterator.CleanWorkingData(); ivTargetValueIndexes.SetSize(0); @@ -2261,16 +2490,8 @@ int SNBDataTableBinarySliceSet::GetInstanceNumber() const int SNBDataTableBinarySliceSet::GetActiveInstanceNumber() const { - int nEffectiveInstanceNumber; - - require(layout.IsInitialized()); - - if (nInitializedChunkIndex >= 0) - nEffectiveInstanceNumber = layout.GetInstanceNumberAtChunk(nInitializedChunkIndex); - else - nEffectiveInstanceNumber = layout.GetInstanceNumber(); - - return nEffectiveInstanceNumber; + require(IsInitialized()); + return layout.GetInstanceNumberAtChunk(chunkBuffer.GetChunkIndex()); } int SNBDataTableBinarySliceSet::GetTargetValueIndexAt(int nInstance) const @@ -2282,15 +2503,8 @@ int SNBDataTableBinarySliceSet::GetTargetValueIndexAt(int nInstance) const int SNBDataTableBinarySliceSet::GetTargetValueIndexAtActiveInstance(int nActiveInstance) const { - int nTargetValue; - - if (nInitializedChunkIndex >= 0) - nTargetValue = - GetTargetValueIndexAt(layout.GetInstanceOffsetAtChunk(nInitializedChunkIndex) + nActiveInstance); - else - nTargetValue = GetTargetValueIndexAt(nActiveInstance); - - return nTargetValue; + require(IsInitialized()); + return GetTargetValueIndexAt(layout.GetInstanceOffsetAtChunk(chunkBuffer.GetChunkIndex()) + nActiveInstance); } void SNBDataTableBinarySliceSet::ExportTargetPartFrequencies(const SNBDataTableBinarySliceSetAttribute* attribute, @@ -2311,42 +2525,47 @@ boolean SNBDataTableBinarySliceSet::UpdateTargetValueScores(SNBDataTableBinarySl Continuous cDeltaWeight, ContinuousVector* cvScores) { boolean bOk = true; - IntVector* ivRecodedAttributeIndexes; - int nInstance; - int nChunk; + SNBDataTableBinarySliceSetColumn* column; int nChunkInstanceNumber; int nChunkInstance; Continuous cLogProb; + int nColumnValueIndex; + int nColumnValueNumber; + require(IsInitialized()); require(cvScores != NULL); require(cvScores->GetSize() == GetActiveInstanceNumber()); - require(IsPartiallyInitialized()); require(ContainsAttribute(attribute)); - // Mise a jour des log-probabilites de la cible dans le vecteur de sortie - ivRecodedAttributeIndexes = NULL; - nInstance = 0; - for (nChunk = 0; nChunk < layout.GetChunkNumber(); nChunk++) + // Lecture des indexes de recodage pour l'attribut et chunk courant + column = NULL; + bOk = chunkBuffer.CollectRecodedAttributeIndexes(attribute->GetIndex(), column); + if (not bOk) + bIsError = true; + if (bOk) { - if (dataBuffer.IsInitializedAtChunk(nChunk)) + // Mise a jour des log-probabilites de la cible dans le vecteur de sortie + // Cas sparse : On ne met a jour que les scores des instances ou l'attribut n'est pas manquante + if (column->IsSparse()) { - // Lecture des indexes de recodage pour l'attribut et chunk courant - bOk = dataBuffer.CollectRecodedAttributeIndexes(nChunk, attribute->GetIndex(), - ivRecodedAttributeIndexes); - if (not bOk) + nColumnValueNumber = column->GetValueNumber(); + for (nColumnValueIndex = 0; nColumnValueIndex < nColumnValueNumber; nColumnValueIndex++) { - bIsError = true; - break; + nChunkInstance = column->GetSparseValueInstanceIndexAt(nColumnValueIndex); + cLogProb = attribute->GetLnSourceConditionalProb( + column->GetSparseValueAt(nColumnValueIndex), nTarget); + cvScores->UpgradeAt(nChunkInstance, cDeltaWeight * cLogProb); } - - // Mise a jour du vecteur de sortie pour le chunk courant - nChunkInstanceNumber = layout.GetInstanceNumberAtChunk(nChunk); + } + // Cas sparse : On ne met a jour les scores de toutes les instances + else + { + nChunkInstanceNumber = layout.GetInstanceNumberAtChunk(chunkBuffer.GetChunkIndex()); for (nChunkInstance = 0; nChunkInstance < nChunkInstanceNumber; nChunkInstance++) { cLogProb = attribute->GetLnSourceConditionalProb( - ivRecodedAttributeIndexes->GetAt(nChunkInstance), nTarget); - cvScores->UpgradeAt(nInstance, cDeltaWeight * cLogProb); - nInstance++; + column->GetDenseValueAt(nChunkInstance), nTarget); + cvScores->UpgradeAt(nChunkInstance, cDeltaWeight * cLogProb); } } } @@ -2357,11 +2576,13 @@ void SNBDataTableBinarySliceSet::WriteContentsAsTSV(ostream& ost) { if (layout.IsInitialized()) { - ost << layout << endl; - dataBuffer.WriteContentsAsTSV(ost, &schema); + ost << "Binary Slice Set\n"; + ost << layout << "\n\n"; + chunkBuffer.WriteContentsAsTSV(ost, &schema); + ost << "----------\n"; } else - ost << "WriteContentsAsTSV: Layout not initialized" << endl; + ost << "Binary slice set layout not initialized\n"; } boolean SNBDataTableBinarySliceSet::IsError() const @@ -2381,16 +2602,14 @@ boolean SNBDataTableBinarySliceSet::Check() const int nSlice; int nSliceAttribute; SNBDataTableBinarySliceSetAttribute* attribute; - int nChunk; if (dataPreparationClass != NULL) { - // bOk = bOk and oaUsedDataPreparationAttributes != NULL; + // bIsInitialized = bIsInitialized and oaUsedDataPreparationAttributes != NULL; bOk = bOk and dataPreparationClass->CheckDataPreparation(); bOk = bOk and dataPreparationClass->GetDataPreparationClass()->Check(); // TODO: GetDataPreparationAttributes n'est pas const ! - // bOk = bOk and nInitialAttributeNumber == - // dataPreparationClass.GetDataPreparationAttributes()->GetSize(); + // bIsInitialized = bIsInitialized and nInitialAttributeNumber == dataPreparationClass.GetDataPreparationAttributes()->GetDataSize(); } if (schema.IsInitialized()) @@ -2427,12 +2646,12 @@ boolean SNBDataTableBinarySliceSet::Check() const } } - if (dataBuffer.IsInitialized()) + if (chunkBuffer.IsInitialized()) { bOk = bOk and layout.IsInitialized(); bOk = bOk and schema.IsInitialized(); - bOk = bOk and dataBuffer.GetLayout() == &layout; - bOk = bOk and dataBuffer.Check(); + bOk = bOk and chunkBuffer.GetLayout() == &layout; + bOk = bOk and chunkBuffer.Check(); } if (ivTargetValueIndexes.GetSize() > 0) @@ -2441,19 +2660,6 @@ boolean SNBDataTableBinarySliceSet::Check() const bOk = bOk and ivTargetValueIndexes.GetSize() == layout.GetInstanceNumber(); } - if (nInitializedChunkIndex >= 0) - { - bOk = bOk and layout.IsInitialized(); - bOk = bOk and (0 <= nInitializedChunkIndex and nInitializedChunkIndex < layout.GetChunkNumber()); - bOk = bOk and dataBuffer.IsInitializedAtChunk(nInitializedChunkIndex); - - for (nChunk = 0; nChunk < layout.GetChunkNumber(); nChunk++) - { - if (nChunk != nInitializedChunkIndex) - bOk = bOk and not dataBuffer.IsInitializedAtChunk(nChunk); - } - } - bOk = bOk and not bIsError; return bOk; diff --git a/src/Learning/SNBPredictor/SNBDataTableBinarySliceSet.h b/src/Learning/SNBPredictor/SNBDataTableBinarySliceSet.h index 59884c399..30765e609 100644 --- a/src/Learning/SNBPredictor/SNBDataTableBinarySliceSet.h +++ b/src/Learning/SNBPredictor/SNBDataTableBinarySliceSet.h @@ -9,7 +9,8 @@ class SNBDataTableBinarySliceSetAttribute; class PLShared_DataTableBinarySliceSetAttribute; class SNBDataTableBinarySliceSetSchema; class SNBDataTableBinarySliceSetRandomizedAttributeIterator; -class SNBDataTableBinarySliceSetBuffer; +class SNBDataTableBinarySliceSetColumn; +class SNBDataTableBinarySliceSetChunkBuffer; class SNBDataTableBinarySliceSet; #include "KWDataPreparationClass.h" @@ -76,11 +77,8 @@ class SNBDataTableBinarySliceSetLayout : public Object // Index d'un attribut dans sa slice int GetRelativeIndexAtAttribute(int nAttribute) const; - // Nombre de `int`s contenus dans un bloc - longint GetBlockSizeAt(int nChunk, int nSlice) const; - - // Offset en bytes d'un bloc dans son fichier de chunk - longint GetBlockOffsetAt(int nChunk, int nSlice) const; + // True si l'attribute est sparse + boolean IsAttributeSparseAt(int nAttribute) const; ////////////////////////////// // Services divers @@ -131,11 +129,8 @@ class SNBDataTableBinarySliceSetLayout : public Object // Indexes des attributs relatifs a ses slices IntVector ivAttributeRelativeIndexes; - // Tableau de LongintVectors's contenant les tailles des des blocs - ObjectArray oaBlockSizes; - - // Tableau de LongintVector's contenant les offsets des blocs - ObjectArray oaBlockOffsets; + // Indique si un attribut est sparse + IntVector ivAttributeSparseModes; }; ///////////////////////////////////////////////////////////////////////////////////////////////////// @@ -376,7 +371,7 @@ class SNBDataTableBinarySliceSetRandomizedAttributeIterator : public Object /////////////////////////////////////////////////////////////////////////////////////////////////// //// Implementation -private: +protected: // Ensemble d'attributs de la database const SNBDataTableBinarySliceSetSchema* schema; @@ -391,16 +386,154 @@ class SNBDataTableBinarySliceSetRandomizedAttributeIterator : public Object }; ///////////////////////////////////////////////////////////////////////////////////////////////////// -// Buffer de lecture de une SNBDataTableBinarySliceSet -// Son initialisation entraine l'ecriture des fichiers de donnees binaires a partir -// d'un KWDataTableSliceSet et une classe de recodage +// Colonne pour la lecture de donnees d'un attribut d'un SNBDataTableBinarySliceSet +// +class SNBDataTableBinarySliceSetColumn : public Object +{ +public: + // Constructeur + SNBDataTableBinarySliceSetColumn(); + ~SNBDataTableBinarySliceSetColumn(); + + // Parametrage du mode sparse + void SetSparse(boolean bSparseValue); + boolean IsSparse() const; + + ///////////////////////////////////////////////////////////////////////////// + // API d'acces aux donnes (read-only) + + // Nombre de valeurs stockes + int GetValueNumber() const; + + // Acces a une valeur normal + int GetDenseValueAt(int nValueIndex) const; + + // Acces a une valeur sparse + int GetSparseValueAt(int nValueIndex) const; + + // Acces a l'index d'instance d'une valeur sparse + int GetSparseValueInstanceIndexAt(int nValueIndex) const; + + //////////////////////////////////////////////////////////////////// + // API de bas niveau independante du type stocke (read-write) + + // Taille des donnees (en nombre de `int`s) de la colonne + boolean SetDataSize(int nSize); + int GetDataSize() const; + + // Acces aux donnees + void SetDataAt(int nDataIndex, int nDataValue); + int GetDataAt(int nDataIndex) const; + + // Ajout a la fin du tableau de donnees + void AddData(int nDataValue); + + ////////////////////////////////////// + // Services Divers + boolean Check() const override; + + /////////////////////////////////////////////////////////////////////////////////////////////////// + //// Implementation +protected: + // True si les donnees sont stockes en mode sparse + boolean bIsSparse; + + // Tableau des donnees de l'attribut + IntVector ivData; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////////// +// Layout physique d'un chunk +// +class SNBDataTableBinarySliceSetChunkPhysicalLayout : public Object +{ +public: + // Constructeur + SNBDataTableBinarySliceSetChunkPhysicalLayout(); + ~SNBDataTableBinarySliceSetChunkPhysicalLayout(); + + ///////////////////////////////////// + // Initialisation + + // Initialisation partielle + void PartiallyInitialize(int nChunk, const SNBDataTableBinarySliceSetLayout* someLayout); + + // True si le parametrage du layout et du index du chunk est coherent + boolean IsPartiallyInitialized() const; + + // Finalise l'initialisation une fois saisies les tailles des donnees d'attributs + void FinishInitialization(); + + // True si l'initialisation est complete + boolean IsInitialized() const; + + ///////////////////////////////////////////// + // Acces aux informations + + // Indice du chunk + int GetChunkIndex() const; + + // Layout logique du binary slice set + const SNBDataTableBinarySliceSetLayout* GetLayout() const; + + // Nombre de `int`s contenus dans un bloc + longint GetBlockSizeAt(int nSlice) const; + + // Offset (en nombre de `int`s) d'un bloc dans son fichier de chunk + longint GetBlockOffsetAt(int nSlice) const; + + // Taille en `int`s des donnes d'un attribut + void SetAttributeDataSizeAt(int nAttribute, int lAttributeDataSize); + int GetAttributeDataSizeAt(int nAttribute) const; + + // Nombre total de `int`s du chunk + longint GetDataSize() const; + + ////////////////////////////////////// + // Services Divers + + // Ecriture vers un stream + void Write(ostream& ost) const override; + + // Test d'integrite + boolean Check() const override; + + /////////////////////////////////////////////////////////////////////////////////////////////////// + //// Implementation +protected: + ///////////////////////// + // Parametres + + // Indice du chunk + int nChunkIndex; + + // Layout logique du binary slice set + const SNBDataTableBinarySliceSetLayout* layout; + + /////////////////////////////////// + // Objets de travail + + // Tailles des blocs du chunk + LongintVector lvBlockSizes; + + // Offsets des blocks du chunk + LongintVector lvBlockOffsets; + + // Tailles des donnees des attributs + IntVector ivAttributeDataSizes; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////////// +// Buffer de lecture pour un chunk d'un SNBDataTableBinarySliceSet +// Son initialisation entraine l'ecriture des fichiers de donnees binaires a partir d'un +// KWDataTableSliceSet et une classe de recodage. // Apres initialisation les fichiers sont Read-Only jusqu'a une nouvelle initialisation -class SNBDataTableBinarySliceSetBuffer : public Object +class SNBDataTableBinarySliceSetChunkBuffer : public Object { public: // Constructeur - SNBDataTableBinarySliceSetBuffer(); - ~SNBDataTableBinarySliceSetBuffer(); + SNBDataTableBinarySliceSetChunkBuffer(); + ~SNBDataTableBinarySliceSetChunkBuffer(); ///////////////////////////// // Initialisation @@ -411,21 +544,14 @@ class SNBDataTableBinarySliceSetBuffer : public Object const SNBDataTableBinarySliceSetLayout* GetLayout() const; // Initialisation - boolean Initialize(KWClass* recoderClass, KWDataTableSliceSet* sliceSet, + boolean Initialize(int nChunk, KWClass* recoderClass, KWDataTableSliceSet* sliceSet, const SNBDataTableBinarySliceSetSchema* schema); - // Initialisation pour un seul chunk - boolean InitializeOnlyAtChunk(int nChunk, KWClass* recoderClass, KWDataTableSliceSet* sliceSet, - SNBDataTableBinarySliceSetSchema* schema); - - // True si tous les chunks sont initialises + // True si initalisee boolean IsInitialized() const; - // True si le chunk specifie est initialise - boolean IsInitializedAtChunk(int nChunk) const; - - // True si le buffer est initialize pour un seul chunk - boolean IsInitializedOnlyAtChunk(int nChunk) const; + // Index du chunk associe a buffer + int GetChunkIndex() const; // Nettoyage : libere la memoire et efface les fichiers de donnees void CleanWorkingData(); @@ -434,7 +560,7 @@ class SNBDataTableBinarySliceSetBuffer : public Object // Acces aux donnees // Recollection des indices d'un chunk pour un attribut - boolean CollectRecodedAttributeIndexes(int nChunk, int nAttribute, IntVector*& ivOutput); + boolean CollectRecodedAttributeIndexes(int nAttribute, SNBDataTableBinarySliceSetColumn*& outputColumn); ////////////////////////////// // Services divers @@ -459,15 +585,15 @@ class SNBDataTableBinarySliceSetBuffer : public Object // Initialisation // Initialisation du buffer de lecture - boolean InitializeBuffer(); + boolean InitializeColumns(); // Initialization du fichier de donnees d'un chunk a partir d'un slice set et une classe de recodage - boolean InitializeDataFileAtChunk(int nChunk, KWClass* recoderClass, KWDataTableSliceSet* sliceSet, - const SNBDataTableBinarySliceSetSchema* schema); + boolean InitializeDataFileFromSliceSetAt(KWClass* recoderClass, KWDataTableSliceSet* sliceSet, + const SNBDataTableBinarySliceSetSchema* schema); // Charge un bloc en memoire depuis un KWDataTableSliceSet - boolean LoadBlockFromSliceSetAt(int nChunk, int nSlice, KWClass* recoderClass, KWDataTableSliceSet* sliceSet, - const SNBDataTableBinarySliceSetSchema* schema); + boolean InitializeBlockFromSliceSetAt(int nSlice, KWClass* recoderClass, KWDataTableSliceSet* sliceSet, + const SNBDataTableBinarySliceSetSchema* schema); // Charge dans la classe de recodage (recoderClass) les attributs de la slice indiquee // Les index de chargement sont stockes dans le vecteur livLoadedRecodedAttributeIndexes @@ -479,7 +605,7 @@ class SNBDataTableBinarySliceSetBuffer : public Object const ALString CreateTempFileForChunk(int nChunk) const; // Mini API d'ecriture de fichier de donnees utilisee dans l'initialisation - boolean OpenOutputDataFileAtChunk(int nChunk); + boolean OpenOutputDataFile(); boolean WriteToDataFile(int* writeBuffer, int nIntNumber) const; boolean CloseOutputDataFile(); @@ -487,13 +613,10 @@ class SNBDataTableBinarySliceSetBuffer : public Object // Chargement de fichiers de donnees en memoire // Chargement d'un bloc : si le bloc est deja en memoire on ne fait rien - boolean LoadBlockAt(int nChunk, int nSlice); - - // Acces aux chemins des fichiers de stockage de la database - ALString GetDataFilePathAtChunk(int nChunk) const; + boolean LoadBlockAt(int nSlice); // Verifie la taille des fichiers en fonction du layout - boolean CheckDataFileAtChunk(int nChunk) const; + boolean CheckChunkFile() const; ///////////////////////// // Parametres @@ -504,21 +627,18 @@ class SNBDataTableBinarySliceSetBuffer : public Object /////////////////////////////////// // Objets de travail + // Indice du chunk lie a ce buffer + int nChunkIndex; + + // Layout physique du chunk + SNBDataTableBinarySliceSetChunkPhysicalLayout physicalLayout; + // Chemins des fichiers pour chaque chunk initialisee - StringVector svChunkFilePaths; + ALString sChunkFilePath; // Tableau contenant les donnees en memoire ObjectArray oaLoadedBlock; - // True si il n'y que un seul chunk initialise - boolean bIsSingleChunkBuffer; - - // Indice du seul chunk initialize si bIsSingleChunkBuffer - int nSingleInitializedChunkIndex; - - // Indice du chunk charge en memoire - int nLoadedBlockChunkIndex; - // Indice de la slice charge en memoire int nLoadedBlockSliceIndex; @@ -532,10 +652,6 @@ class SNBDataTableBinarySliceSetBuffer : public Object // Indice du fichier ouvert pendant l'initialisation int nOpenFileChunkIndex; - // TODO FOR FELIPE: a supprimer - //// Taille du buffer en nombre de int's - // static const int nIntBufferSize = MemSegmentByteSize / sizeof(int); - // La tache parallele d'apprentissage est friend pour faire des initialisations partielles friend class SNBPredictorSNBTrainingTask; }; @@ -556,16 +672,17 @@ class SNBDataTableBinarySliceSet : public Object // Initialisation // Initialisation complete - void Initialize(KWClassStats* classStats, int nChunkNumber, int nMaxAttributes, int nSliceNumber); + void InitializeWithoutChunkBuffer(KWClassStats* classStats, int nChunkNumber, int nMaxAttributes, + int nSliceNumber); - // Initialisation du schema d'attributs a partir d'un KWClassStats - void InitializeSchemaFromClassStats(KWClassStats* classStats, int nMaxAttributes); + // True si le schema est specifie et il y au moins un chunk initialise pour lire + boolean IsInitializedWithoutBuffer() const; - // True si le schema est specifie et tous les chunks initialises pour lire + // True si le schema est specifie et un chunk est pret a lire boolean IsInitialized() const; - // True si le schema est specifie et il y au moins un chunk initialise pour lire - boolean IsPartiallyInitialized() const; + // Initialisation du schema d'attributs a partir d'un KWClassStats + void InitializeSchemaFromClassStats(KWClassStats* classStats, int nMaxAttributes); // True s'il y a des attributs utiles boolean HasUsableAttributes() const; @@ -579,7 +696,7 @@ class SNBDataTableBinarySliceSet : public Object // Nombre d'attributs int GetAttributeNumber() const; - // Nombre d'attributs initiales (inclu les non predictifs) + // Nombre d'attributs initiales (inclue les non predictifs) int GetInitialAttributeNumber() const; // Acces aux attributs par index @@ -682,11 +799,8 @@ class SNBDataTableBinarySliceSet : public Object // Index de partie associe a une valeur d'une cible continue int ComputeContinuousTargetValuePartIndex(Continuous cTargetValue, const KWClassStats* classStats) const; - // Initialisation du buffer et tous les fichiers de chunk - boolean InitializeAllChunks(KWDataTableSliceSet* sliceSet); - // Initialisation partielle du buffer pour le chunk specifie - boolean InitializeBufferOnlyAtChunk(int nChunk, KWDataTableSliceSet* sliceSet); + boolean InitializeBufferAtChunk(int nChunk, KWDataTableSliceSet* sliceSet); ///////////////////////////////// // Objects de travail @@ -705,7 +819,7 @@ class SNBDataTableBinarySliceSet : public Object SNBDataTableBinarySliceSetRandomizedAttributeIterator randomizedAttributeIterator; // Buffer de lecture par blocs - SNBDataTableBinarySliceSetBuffer dataBuffer; + SNBDataTableBinarySliceSetChunkBuffer chunkBuffer; // Nombre d'attributs initiaux int nInitialAttributeNumber; @@ -713,9 +827,6 @@ class SNBDataTableBinarySliceSet : public Object // Indices des valeurs de la cible IntVector ivTargetValueIndexes; - // Index du seul chunk initialise pour l'initialisation partielle du buffer - int nInitializedChunkIndex; - // True s'il a eu un erreur de lecture du disque boolean bIsError; diff --git a/src/Learning/SNBPredictor/SNBPredictorSelectionDataCostCalculator.cpp b/src/Learning/SNBPredictor/SNBPredictorSelectionDataCostCalculator.cpp index e7e25b49d..30a59a9f7 100644 --- a/src/Learning/SNBPredictor/SNBPredictorSelectionDataCostCalculator.cpp +++ b/src/Learning/SNBPredictor/SNBPredictorSelectionDataCostCalculator.cpp @@ -48,7 +48,7 @@ SNBTargetPart* SNBPredictorSelectionDataCostCalculator::GetOrCreatePart() boolean bAllocOk; require(GetDataTableBinarySliceSet() != NULL); - require(GetDataTableBinarySliceSet()->IsPartiallyInitialized()); + require(GetDataTableBinarySliceSet()->IsInitialized()); // S'il y a un objet SNBTargetPart disponible dans le cache on le retourne if (olDeletedPartsCache.GetCount() > 0) @@ -72,7 +72,7 @@ void SNBPredictorSelectionDataCostCalculator::ReleasePart(SNBTargetPart* targetP { require(targetPart != NULL); require(GetDataTableBinarySliceSet() != NULL); - require(GetDataTableBinarySliceSet()->IsPartiallyInitialized()); + require(GetDataTableBinarySliceSet()->IsInitializedWithoutBuffer()); require(targetPart->GetScores()->GetSize() == GetDataTableBinarySliceSet()->GetActiveInstanceNumber()); // On garde dans le cache la SNBTargetPart eliminee pour une utilisation ulteriure si besoin @@ -235,7 +235,7 @@ double SNBClassifierSelectionDataCostCalculator::ComputeSelectionDataCost() double dLaplaceNumerator; require(Check()); - require(GetDataTableBinarySliceSet()->IsPartiallyInitialized()); + require(GetDataTableBinarySliceSet()->IsInitialized()); // Entete de la trace if (bLocalTrace) @@ -396,7 +396,7 @@ boolean SNBClassifierSelectionDataCostCalculator::UpdateTargetPartScoresWithWeig require(attribute != NULL); require(GetDataTableBinarySliceSet() != NULL); - require(GetDataTableBinarySliceSet()->IsPartiallyInitialized()); + require(GetDataTableBinarySliceSet()->IsInitialized()); require(Check()); // Calcul des probabilites conditionnelles par classe cible @@ -464,9 +464,9 @@ boolean SNBClassifierSelectionDataCostCalculator::Check() const boolean SNBClassifierSelectionDataCostCalculator::CheckParts() const { boolean bOk = true; + ALString sTmp = ""; int nTarget; SNBTargetPart* targetPart; - ALString sTmp; for (nTarget = 0; nTarget < GetTargetDescriptiveStats()->GetValueNumber(); nTarget++) { @@ -1354,8 +1354,7 @@ void SNBGeneralizedClassifierSelectionDataCostCalculator::InitializeTargetValueG nTargetValueIndex <= targetValueGroupsPartition->GetGroupLastValueIndexAt(nGroup); nTargetValueIndex++) { - // Memorisation du groupe par defaut si l'on rencontre la valeur speciale - // StarValue + // Memorisation du groupe par defaut si l'on rencontre la valeur speciale StarValue if (nDefaultGroup == -1 and targetValueGroupsPartition->GetValueAt( nTargetValueIndex) == Symbol::GetStarValue()) { diff --git a/src/Learning/SNBPredictor/SNBPredictorSelectiveNaiveBayesTrainingTask.cpp b/src/Learning/SNBPredictor/SNBPredictorSelectiveNaiveBayesTrainingTask.cpp index a5d141d5b..a30e26a88 100644 --- a/src/Learning/SNBPredictor/SNBPredictorSelectiveNaiveBayesTrainingTask.cpp +++ b/src/Learning/SNBPredictor/SNBPredictorSelectiveNaiveBayesTrainingTask.cpp @@ -429,7 +429,7 @@ longint SNBPredictorSNBTrainingTask::ComputeGlobalSlaveNecessaryMemory(int nSlic // La memoire global du buffer et du scorer s'estime avec les estimations des objets necessaires avec un seul // chunk lGlobalDataCostCalculatorMemory = ComputeGlobalSlaveScorerNecessaryMemory(); - lGlobalBinarySliceSetBufferMemory = SNBDataTableBinarySliceSetBuffer::ComputeNecessaryMemory( + lGlobalBinarySliceSetBufferMemory = SNBDataTableBinarySliceSetChunkBuffer::ComputeNecessaryMemory( nInstanceNumber, 1, nAttributeNumber, nSliceNumber); // Estimation de la memoire necessaire pour le recodage @@ -496,7 +496,7 @@ longint SNBPredictorSNBTrainingTask::ComputeSlaveNecessaryMemory(int nSlaveProce lBinarySliceSetSelfMemory = sizeof(SNBDataTableBinarySliceSet) - sizeof(SNBDataTableBinarySliceSetSchema) - sizeof(SNBDataTableBinarySliceSetLayout) - sizeof(SNBDataTableBinarySliceSetRandomizedAttributeIterator) - - sizeof(SNBDataTableBinarySliceSetBuffer); + sizeof(SNBDataTableBinarySliceSetChunkBuffer); lSelectionScorerMemory = ComputeSlaveScorerNecessaryMemory(); // Memoire du scorer sans ses contenus @@ -716,12 +716,12 @@ longint SNBPredictorSNBTrainingTask::ComputeGlobalSlaveNecessaryDisk() const int nAttributeNumber; longint lGlobalSlaveDisk; - // On estime la capacite totale du disque par la taille du SNBDataTableBinarySliceSetBuffer avec + // On estime la capacite totale du disque par la taille du SNBDataTableBinarySliceSetChunkBuffer avec // un seul esclave et une seule slice (un peu sur-estimee) nInstanceNumber = masterSnbPredictor->GetInstanceNumber(); nAttributeNumber = masterSnbPredictor->GetTrainingAttributeNumber(); lGlobalSlaveDisk = - SNBDataTableBinarySliceSetBuffer::ComputeNecessaryMemory(nInstanceNumber, 1, nAttributeNumber, 1); + SNBDataTableBinarySliceSetChunkBuffer::ComputeNecessaryMemory(nInstanceNumber, 1, nAttributeNumber, 1); return lGlobalSlaveDisk; } @@ -833,14 +833,9 @@ boolean SNBPredictorSNBTrainingTask::MasterInitializeDataTableBinarySliceSet() { // Initialisation du SNBDataTableBinarySliceSet du maitre masterBinarySliceSet = new SNBDataTableBinarySliceSet; - masterBinarySliceSet->InitializeSchemaFromClassStats( - masterSnbPredictor->GetClassStats(), - masterSnbPredictor->GetTrainParameters()->GetMaxEvaluatedAttributeNumber()); - masterBinarySliceSet->layout.Initialize(masterSnbPredictor->GetInstanceNumber(), nSlaveProcessNumber, - masterBinarySliceSet->GetAttributeNumber(), nSliceNumber); - masterBinarySliceSet->randomizedAttributeIterator.Initialize(&masterBinarySliceSet->schema, - &masterBinarySliceSet->layout); - masterBinarySliceSet->InitializeTargetValueIndexes(masterSnbPredictor->GetClassStats()); + masterBinarySliceSet->InitializeWithoutChunkBuffer( + masterSnbPredictor->GetClassStats(), nSlaveProcessNumber, + masterSnbPredictor->GetTrainParameters()->GetMaxEvaluatedAttributeNumber(), nSliceNumber); // Changement le domaine de travail a celui de la classe de recodage recoderClass = masterBinarySliceSet->GetDataPreparationClass()->GetDataPreparationClass(); @@ -892,21 +887,10 @@ boolean SNBPredictorSNBTrainingTask::MasterInitializeDataTableBinarySliceSet() boolean SNBPredictorSNBTrainingTask::IsMasterDataTableBinarySliceSetInitialized() const { boolean bOk = true; - int nChunk; bOk = bOk and masterBinarySliceSet != NULL; + bOk = bOk and masterBinarySliceSet->IsInitializedWithoutBuffer(); bOk = bOk and masterBinarySliceSet->dataPreparationClass != NULL; - bOk = bOk and masterBinarySliceSet->schema.IsInitialized(); - bOk = bOk and masterBinarySliceSet->layout.IsInitialized(); - bOk = bOk and masterBinarySliceSet->randomizedAttributeIterator.IsInitialized(); - bOk = bOk and masterBinarySliceSet->ivTargetValueIndexes.GetSize() > 0; - bOk = bOk and masterBinarySliceSet->nInitializedChunkIndex < 0; - bOk = bOk and not masterBinarySliceSet->dataBuffer.IsInitialized(); - if (bOk) - { - for (nChunk = 0; nChunk < masterBinarySliceSet->layout.GetChunkNumber(); nChunk++) - bOk = bOk and not masterBinarySliceSet->dataBuffer.IsInitializedOnlyAtChunk(nChunk); - } return bOk; } @@ -1487,12 +1471,10 @@ boolean SNBPredictorSNBTrainingTask::SlaveInitializeDataTableBinarySliceSet() slaveBinarySliceSet->schema.InitializeFromAttributes(oaAttributes); // Initialisation du buffer de lecture de donnees pour le chunk l'esclave - slaveBinarySliceSet->dataBuffer.SetLayout(&slaveBinarySliceSet->layout); - slaveBinarySliceSet->dataBuffer.InitializeBuffer(); - slaveBinarySliceSet->nInitializedChunkIndex = GetSlaveChunkIndex(); - bOk = slaveBinarySliceSet->dataBuffer.InitializeOnlyAtChunk(GetSlaveChunkIndex(), slaveRecoderClass, - shared_dataTableSliceSet.GetDataTableSliceSet(), - &slaveBinarySliceSet->schema); + slaveBinarySliceSet->chunkBuffer.SetLayout(&slaveBinarySliceSet->layout); + bOk = slaveBinarySliceSet->chunkBuffer.Initialize(GetSlaveChunkIndex(), slaveRecoderClass, + shared_dataTableSliceSet.GetDataTableSliceSet(), + &slaveBinarySliceSet->schema); // En parallele on decharge la classe de recodage if (IsParallel()) @@ -1505,12 +1487,7 @@ boolean SNBPredictorSNBTrainingTask::SlaveInitializeDataTableBinarySliceSet() boolean SNBPredictorSNBTrainingTask::IsSlaveDataTableBinarySliceSetInitialized() const { - return slaveBinarySliceSet != NULL and slaveBinarySliceSet->dataPreparationClass == NULL and - slaveBinarySliceSet->schema.IsInitialized() and slaveBinarySliceSet->layout.IsInitialized() and - not slaveBinarySliceSet->randomizedAttributeIterator.IsInitialized() and - slaveBinarySliceSet->ivTargetValueIndexes.GetSize() > 0 and - slaveBinarySliceSet->nInitializedChunkIndex >= 0 and - slaveBinarySliceSet->dataBuffer.IsInitializedOnlyAtChunk(GetSlaveChunkIndex()); + return slaveBinarySliceSet != NULL and slaveBinarySliceSet->IsInitialized(); } void SNBPredictorSNBTrainingTask::SlaveInitializeUnloadRecoderClass() @@ -1553,7 +1530,7 @@ boolean SNBPredictorSNBTrainingTask::SlaveProcess() boolean SNBPredictorSNBTrainingTask::SlaveFinalize(boolean bProcessEndedCorrectly) { require(IsSlaveDataTableBinarySliceSetInitialized()); - require(slaveBinarySliceSet->Check()); + require(slaveBinarySliceSet == NULL or slaveBinarySliceSet->Check()); require(slaveRecoderClass != NULL or IsParallel()); // Nettoyage des objets de travail l'esclave @@ -2609,7 +2586,7 @@ boolean SNBPredictorSNBDirectTrainingTask::SlaveFinalize(boolean bProcessEndedCo { boolean bOk; - require(slaveWeightedSelectionScorer != NULL or not bProcessEndedCorrectly); + require(not bProcessEndedCorrectly or slaveWeightedSelectionScorer != NULL); // Nettoyage du scorer de l'esclave, qui peut etre nul s'il y a eu un probleme d'initialisation if (slaveWeightedSelectionScorer != NULL)