/**
 * Returns a pair of {Gradient, Delta} given the pre-output activations (preOut).
 */
private Pair<Gradient, INDArray> getGradientsAndDelta(INDArray preOut) {
    ILossFunction lossFunction = layerConf().getLossFn();
    INDArray labels2d = getLabels2d();
    INDArray delta = lossFunction.computeGradient(labels2d, preOut, layerConf().getActivationFn(), maskArray);

    Gradient gradient = new DefaultGradient();

    INDArray weightGradView = gradientViews.get(DefaultParamInitializer.WEIGHT_KEY);
    INDArray biasGradView = gradientViews.get(DefaultParamInitializer.BIAS_KEY);

    Nd4j.gemm(input, delta, weightGradView, true, false, 1.0, 0.0); //Equivalent to: weightGradView.assign(input.transpose().mmul(delta));
    delta.sum(biasGradView, 0); //biasGradView is initialized/zeroed first in sum op

    gradient.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGradView);
    gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGradView);

    return new Pair<>(gradient, delta);
}
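The two in-place operations above can be checked in isolation. The following standalone sketch (not part of the layer; array sizes are arbitrary and only an ND4J dependency is assumed) shows that the gemm call reproduces input^T * delta and that the bias gradient is the column-wise sum of delta.

// Hedged sketch: verifies the gradient identities used by getGradientsAndDelta.
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class GradientIdentitySketch {
    public static void main(String[] args) {
        INDArray input = Nd4j.rand(4, 3);   // [miniBatch, nIn]
        INDArray delta = Nd4j.rand(4, 2);   // [miniBatch, nOut]

        // Weight gradient: dL/dW = input^T * delta, written into a pre-allocated f-ordered array
        // (Nd4j.gemm expects an f-ordered result array, as the layer's gradient views are)
        INDArray weightGrad = Nd4j.create(new int[] {3, 2}, 'f');
        Nd4j.gemm(input, delta, weightGrad, true, false, 1.0, 0.0);
        INDArray reference = input.transpose().mmul(delta);
        System.out.println("gemm matches transpose().mmul(): " + weightGrad.equalsWithEps(reference, 1e-6));

        // Bias gradient: dL/db = column-wise sum of delta, accumulated into a pre-allocated row vector
        INDArray biasGrad = Nd4j.zeros(1, 2);
        delta.sum(biasGrad, 0);
        System.out.println("bias gradient: " + biasGrad);
    }
}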
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    //Basically a dense layer...
    InputType outputType = getOutputType(-1, inputType);

    int numParams = initializer().numParams(this);
    int updaterStateSize = (int) getIUpdater().stateSize(numParams);

    int trainSizeFixed = 0;
    int trainSizeVariable = 0;
    if (getDropOut() > 0) {
        if (false) { //TODO drop connect
            //Dup the weights... note that this does NOT depend on the minibatch size...
            trainSizeVariable += 0; //TODO
        } else {
            //Assume we dup the input
            trainSizeVariable += inputType.arrayElementsPerExample();
        }
    }

    //Also, during backprop: we do a preOut call -> gives us activations of size equal to the output size,
    // which are modified in-place by the activation function backprop;
    // then we have 'epsilonNext', which is equivalent to the input size
    trainSizeVariable += outputType.arrayElementsPerExample();

    return new LayerMemoryReport.Builder(layerName, OutputLayer.class, inputType, outputType)
            .standardMemory(numParams, updaterStateSize)
            .workingMemory(0, 0, trainSizeFixed, trainSizeVariable) //No additional memory (beyond activations) for inference
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching
            .build();
}
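The "basically a dense layer" bookkeeping can be reproduced by hand. The sketch below uses a hypothetical nIn=784, nOut=10 layer and assumes an Adam-style updater with two state values per parameter; it is plain arithmetic, not the LayerMemoryReport API.

// Hedged sketch: the per-example and fixed memory figures the report builder is fed above.
public class MemoryEstimateSketch {
    public static void main(String[] args) {
        int nIn = 784, nOut = 10;
        int numParams = nIn * nOut + nOut;      // weights + biases
        int updaterStateSize = 2 * numParams;   // e.g. Adam keeps two accumulators per parameter (assumption)

        int inputElementsPerExample = nIn;      // duplicated input when dropout is applied
        int outputElementsPerExample = nOut;    // preOut activations, modified in-place during backprop

        int trainSizeVariablePerExample = inputElementsPerExample + outputElementsPerExample;
        System.out.println("params=" + numParams + ", updaterState=" + updaterStateSize
                + ", perExampleWorkingMemory=" + trainSizeVariablePerExample);
    }
}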
} else if (layer instanceof BaseOutputLayer) {
    BaseOutputLayer ol = (BaseOutputLayer) layer;
    map.put("Loss Function", ol.getLossFn().toString());
for (NeuralNetConfiguration nnc : conf.getConfs()) {
    Layer l = nnc.getLayer();
    if (l instanceof BaseOutputLayer && ((BaseOutputLayer) l).getLossFn() == null) {
        BaseOutputLayer ol = (BaseOutputLayer) l;
        switch (lossFunction) {
            case MSE:
                ol.setLossFn(new LossMSE());
                break;
            case XENT:
                ol.setLossFn(new LossBinaryXENT());
                break;
            case NEGATIVELOGLIKELIHOOD:
                ol.setLossFn(new LossNegativeLogLikelihood());
                break;
            case MCXENT:
                ol.setLossFn(new LossMCXENT());
                break;
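The switch above maps the legacy LossFunctions enum onto concrete ILossFunction instances. The sketch below shows the two equivalent ways a user-facing config can express this; builder method names and signatures vary slightly across DL4J versions, so treat it as an assumption rather than a canonical example.

// Hedged sketch: enum-based vs explicit ILossFunction configuration of an output layer.
import org.deeplearning4j.nn.conf.layers.OutputLayer;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.lossfunctions.LossFunctions;
import org.nd4j.linalg.lossfunctions.impl.LossMCXENT;

public class LossFnConfigSketch {
    public static void main(String[] args) {
        // Legacy enum form - resolved to a LossMCXENT instance internally
        OutputLayer viaEnum = new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                .nIn(10).nOut(3)
                .activation(Activation.SOFTMAX)
                .build();

        // Explicit ILossFunction form - equivalent to the MCXENT case in the switch above
        OutputLayer viaInstance = new OutputLayer.Builder(new LossMCXENT())
                .nIn(10).nOut(3)
                .activation(Activation.SOFTMAX)
                .build();

        System.out.println(viaEnum.getLossFn() + " / " + viaInstance.getLossFn());
    }
}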
/**
 * Compute the score for each example individually, after labels and input have been set.
 *
 * @param fullNetworkL1 L1 regularization term for the entire network (or 0.0 to not include regularization)
 * @param fullNetworkL2 L2 regularization term for the entire network (or 0.0 to not include regularization)
 * @return A column INDArray of shape [numExamples,1], where entry i is the score of the ith example
 */
@Override
public INDArray computeScoreForExamples(double fullNetworkL1, double fullNetworkL2) {
    if (input == null || labels == null)
        throw new IllegalStateException("Cannot calculate score without input and labels " + layerId());
    INDArray preOut = preOutput2d(false);

    ILossFunction lossFunction = layerConf().getLossFn();
    INDArray scoreArray =
            lossFunction.computeScoreArray(getLabels2d(), preOut, layerConf().getActivationFn(), maskArray);
    double l1l2 = fullNetworkL1 + fullNetworkL2;
    if (l1l2 != 0.0) {
        scoreArray.addi(l1l2);
    }
    return scoreArray;
}
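computeScoreArray is the per-example counterpart of computeScore. The sketch below calls it directly on a loss function, outside of any layer, to show the [numExamples, 1] result shape; the LossMCXENT/ActivationSoftmax pairing is just one possible choice.

// Hedged sketch: per-example scores straight from an ILossFunction.
import org.nd4j.linalg.activations.impl.ActivationSoftmax;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.lossfunctions.impl.LossMCXENT;

public class PerExampleScoreSketch {
    public static void main(String[] args) {
        INDArray labels = Nd4j.create(new double[][] {{1, 0, 0}, {0, 1, 0}}); // one-hot, [2,3]
        INDArray preOut = Nd4j.rand(2, 3);                                    // pre-softmax activations

        INDArray scoreArray = new LossMCXENT().computeScoreArray(labels, preOut, new ActivationSoftmax(), null);
        // One score per example, shape [numExamples, 1]; any L1/L2 term is added on top of these values
        System.out.println(java.util.Arrays.toString(scoreArray.shape()) + " -> " + scoreArray);
    }
}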
/**
 * Compute the score, after labels and input have been set.
 *
 * @param fullNetworkL1 L1 regularization term for the entire network
 * @param fullNetworkL2 L2 regularization term for the entire network
 * @param training      whether the score should be calculated at train or test time (this affects things like
 *                      application of dropout, etc.)
 * @return score (loss function value)
 */
@Override
public double computeScore(double fullNetworkL1, double fullNetworkL2, boolean training) {
    if (input == null || labels == null)
        throw new IllegalStateException("Cannot calculate score without input and labels " + layerId());
    this.fullNetworkL1 = fullNetworkL1;
    this.fullNetworkL2 = fullNetworkL2;
    INDArray preOut = preOutput2d(training);

    ILossFunction lossFunction = layerConf().getLossFn();
    double score = lossFunction.computeScore(getLabels2d(), preOut, layerConf().getActivationFn(), maskArray, false);
    score += fullNetworkL1 + fullNetworkL2;
    score /= getInputMiniBatchSize();

    this.score = score;

    return score;
}
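Note the average=false argument followed by the division by the minibatch size. Absent regularization, that is the same as asking the loss function for the average directly, which the following standalone sketch illustrates (LossMCXENT/ActivationSoftmax chosen only for illustration).

// Hedged sketch: "sum then divide" equals "average" when no L1/L2 term is added.
import org.nd4j.linalg.activations.impl.ActivationSoftmax;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.lossfunctions.impl.LossMCXENT;

public class ScoreAveragingSketch {
    public static void main(String[] args) {
        INDArray labels = Nd4j.create(new double[][] {{1, 0}, {0, 1}, {1, 0}});
        INDArray preOut = Nd4j.rand(3, 2);
        LossMCXENT loss = new LossMCXENT();
        ActivationSoftmax softmax = new ActivationSoftmax();

        double summedThenDivided = loss.computeScore(labels, preOut, softmax, null, false) / labels.size(0);
        double averaged = loss.computeScore(labels, preOut, softmax, null, true);
        System.out.println(summedThenDivided + " == " + averaged);
    }
}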
private static void configureSoftmaxClippingIfPresent(IOutputLayer outputLayer) {
    ILossFunction lfn = null;
    IActivation afn = null;
    if (outputLayer instanceof BaseOutputLayer) {
        BaseOutputLayer o = (BaseOutputLayer) outputLayer;
        lfn = ((org.deeplearning4j.nn.conf.layers.BaseOutputLayer) o.layerConf()).getLossFn();
        afn = o.layerConf().getActivationFn();
    } else if (outputLayer instanceof LossLayer) {
        LossLayer o = (LossLayer) outputLayer;
        lfn = o.layerConf().getLossFn();
        afn = o.layerConf().getActivationFn();
    }

    if (lfn instanceof LossMCXENT && afn instanceof ActivationSoftmax
            && ((LossMCXENT) lfn).getSoftmaxClipEps() != 0) {
        log.info("Setting softmax clipping epsilon to 0.0 for " + lfn.getClass()
                + " loss function to avoid spurious gradient check failures");
        ((LossMCXENT) lfn).setSoftmaxClipEps(0.0);
    }
}
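The same reset can be done by hand on a standalone LossMCXENT instance, for example before running a gradient check outside of this helper; the sketch below only uses the getter/setter already exercised above.

// Hedged sketch: manually disabling softmax clipping before a gradient check.
import org.nd4j.linalg.lossfunctions.impl.LossMCXENT;

public class SoftmaxClipSketch {
    public static void main(String[] args) {
        LossMCXENT loss = new LossMCXENT();
        if (loss.getSoftmaxClipEps() != 0) {
            // Clipping stabilises training, but it perturbs the analytic gradient near 0/1 probabilities,
            // which shows up as spurious failures when comparing against numerical gradients
            loss.setSoftmaxClipEps(0.0);
        }
        System.out.println("softmax clip eps: " + loss.getSoftmaxClipEps());
    }
}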