diff --git a/Source/1BitSGD/BlockMomentumDistributedLearner.h b/Source/1BitSGD/BlockMomentumDistributedLearner.h
index 1585bd4c2588..d0366cbac94e 100644
--- a/Source/1BitSGD/BlockMomentumDistributedLearner.h
+++ b/Source/1BitSGD/BlockMomentumDistributedLearner.h
@@ -524,38 +524,40 @@ namespace CNTK
                 ResetBuffer<double>(i, p);
             else if (p->GetDataType() == DataType::Float)
                 ResetBuffer<float>(i, p);
+            else if (p->GetDataType() == DataType::Float16)
+                ResetBuffer<half>(i, p);
             else
                 RuntimeError("Unsupported type.");
         }
     }
 
-    template<class ElementType>
+    template<class ElemType>
     void ResetBuffer(size_t index, const NDArrayViewPtr& p)
     {
-        auto data = p->GetMatrix<ElementType>();
+        auto data = p->GetMatrix<ElemType>();
         if (!m_blockLevelSmoothedGradient[index])
         {
             // has not been initialized yet
-            auto pSmoothedGrad = std::make_shared<NDArrayView>(AsDataType<ElementType>(), p->Shape(), AsDeviceDescriptor(data->GetDeviceId()));
-            pSmoothedGrad->SetValue(static_cast<ElementType>(0));
+            auto pSmoothedGrad = std::make_shared<NDArrayView>(AsDataType<ElemType>(), p->Shape(), AsDeviceDescriptor(data->GetDeviceId()));
+            pSmoothedGrad->SetValue(static_cast<ElemType>(0));
             m_blockLevelSmoothedGradient[index] = pSmoothedGrad;
         }
 
         if (!m_prevParameters[index])
         {
-            NDArrayViewPtr newValue = std::make_shared<NDArrayView>(AsDataType<ElementType>(), p->Shape(), AsDeviceDescriptor(data->GetDeviceId()));
-            std::shared_ptr<Matrix<ElementType>> newData = newValue->GetWritableMatrix<ElementType>();
+            NDArrayViewPtr newValue = std::make_shared<NDArrayView>(AsDataType<ElemType>(), p->Shape(), AsDeviceDescriptor(data->GetDeviceId()));
+            std::shared_ptr<Matrix<ElemType>> newData = newValue->GetWritableMatrix<ElemType>();
             newData->SetValue(*data);
             m_prevParameters[index] = newValue;
         }
         else
         {
-            m_prevParameters[index]->GetWritableMatrix<ElementType>()->SetValue(*data);
+            m_prevParameters[index]->GetWritableMatrix<ElemType>()->SetValue(*data);
         }
 
         if (!m_tempBlockGradient[index])
         {
-            m_tempBlockGradient[index] = std::make_shared<NDArrayView>(AsDataType<ElementType>(), p->Shape(), AsDeviceDescriptor(data->GetDeviceId()));
+            m_tempBlockGradient[index] = std::make_shared<NDArrayView>(AsDataType<ElemType>(), p->Shape(), AsDeviceDescriptor(data->GetDeviceId()));
         }
     }
 
diff --git a/Source/CNTKv2LibraryDll/DistributedCommunicator.cpp b/Source/CNTKv2LibraryDll/DistributedCommunicator.cpp
index f37394aab64b..0cabaef81374 100644
--- a/Source/CNTKv2LibraryDll/DistributedCommunicator.cpp
+++ b/Source/CNTKv2LibraryDll/DistributedCommunicator.cpp
@@ -14,6 +14,7 @@
 #include "GPUDataTransferer.h"
 #include <numeric>
 #include "Utils.h"
+#include <iostream>
 
 using namespace Microsoft::MSR::CNTK;
 
@@ -732,6 +733,7 @@ namespace CNTK
 {
     if (m_nccl->IsSupported() && !dataOnCPU)
     {
+        std::cerr << " NCCL fp16 allreduce" << endl;
         m_nccl->AllReduce(inputData, outputData, numElements, op);
         return;
diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h
index 2571fd6f5499..d6e330e6f63f 100644
--- a/Source/Common/Include/Sequences.h
+++ b/Source/Common/Include/Sequences.h
@@ -546,7 +546,8 @@ struct MBLayout
             LogicError("GetColumnIndex: t out of sequence bounds.");
         if (seq.s > GetNumParallelSequences())
             LogicError("GetColumnIndex: seq.s out of sequence bounds."); // can only happen if 'seq' does not come out of our own m_sequences array, which is verboten
-        ptrdiff_t tIn = (ptrdiff_t)t + seq.tBegin; // shifted time index
+        //ptrdiff_t tIn = (ptrdiff_t)t + seq.tBegin; // shifted time index
+        ptrdiff_t tIn = (ptrdiff_t)t + (seq.tBegin > 0 ? seq.tBegin : 0 ); // shifted time index
         if (tIn < 0 || (size_t)tIn >= GetNumTimeSteps())
             LogicError("GetColumnIndex: Attempted to access a time step that is accessing a portion of a sequence that is not included in current minibatch."); // we may encounter this for truncated BPTT
         size_t col = (size_t)tIn * GetNumParallelSequences() + seq.s;
diff --git a/bindings/python/cntk/logging/progress_print.py b/bindings/python/cntk/logging/progress_print.py
index 4d2649fc501c..199e2b9c8f44 100644
--- a/bindings/python/cntk/logging/progress_print.py
+++ b/bindings/python/cntk/logging/progress_print.py
@@ -220,6 +220,8 @@ def write(self, key, value):
     def ___logprint(self, logline):
         if self.log_to_file == None:
            # to stdout. if distributed, all ranks merge output into stdout
+            t = time.localtime()
+            print(str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(t.tm_mday) + ' ' + str(t.tm_hour) + ':' + str(t.tm_min) + ':' + str(t.tm_sec), end = ' ')
             print(logline)
         else:
             # to named file. if distributed, one file per rank