From 1d5f04581ddbdc5e9f6d1d405d429679152bdc1c Mon Sep 17 00:00:00 2001 From: MaximilianSchreff Date: Fri, 10 Jan 2025 16:02:47 +0100 Subject: [PATCH 1/7] GELU forward pass --- scripts/nn/layers/gelu.dml | 68 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 scripts/nn/layers/gelu.dml diff --git a/scripts/nn/layers/gelu.dml b/scripts/nn/layers/gelu.dml new file mode 100644 index 00000000000..a324a72d7aa --- /dev/null +++ b/scripts/nn/layers/gelu.dml @@ -0,0 +1,68 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Gaussian Error Linear Unit (GELU) nonlinearity layer. + */ + +source("nn/layers/tanh.dml") as tanh + +forward = function(matrix[double] X) + return (matrix[double] out) { + /* + * Computes the forward pass for a GELU nonlinearity layer, via + * its tanh approximation. + * + * Performs an element-wise evaluation of + * `GELU(x) = x * CDF(x)`. 
+ * where CDF is the cumulative distribution function of the + * standard normal distribution: + * `CDF(x) = 0.5 * (1 + erf(x/sqrt(2)))` + * This implementation uses the tanh approximation: + * `CDF(x) =~ 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715x^3)))` + * + * Inputs: + * - X: Inputs, of shape (any, any). + * + * Outputs: + * - out: Outputs, of same shape as `X`. + */ + cdf = 0.5 * (1 + tanh(sqrt(2 / pi) * (X + 0.044715 * X^3))) + out = cdf * X +} + +backward = function(matrix[double] dout, matrix[double] X) + return (matrix[double] dX) { + /* + * Computes the backward pass for a ReLU nonlinearity layer. + * + * Essentially performs a pass-through of the upstream gradient + * for cells > 0. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of same shape as `X`. + * - X: Previous input data matrix, of shape (any, any). + * + * Outputs: + * - dX: Gradient wrt `X`, of same shape as `X`. + */ + dX = (X > 0) * dout +} From 4fadd66b5666f92be76cceda66c07d49a2abddcc Mon Sep 17 00:00:00 2001 From: MaximilianSchreff Date: Fri, 10 Jan 2025 16:40:51 +0100 Subject: [PATCH 2/7] Component test script with forward pass test case for gelu --- .../applications/nn/component/gelu.dml | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 src/test/scripts/applications/nn/component/gelu.dml diff --git a/src/test/scripts/applications/nn/component/gelu.dml b/src/test/scripts/applications/nn/component/gelu.dml new file mode 100644 index 00000000000..61d92d56f8a --- /dev/null +++ b/src/test/scripts/applications/nn/component/gelu.dml @@ -0,0 +1,44 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +source("nn/layers/gelu.dml") as gelu +source("src/test/scripts/applications/nn/util.dml") as test_util + + +# Test case 1 + +X = matrix("1. -0.5 + 0. 2.", rows=2, cols=2) +out_expected = matrix("0.841192 -0.154286 + 0. 1.9545977", rows=2, cols=2) +gradient_expected = matrix("1.0829641 0.13263011 + 0.5 1.0860993", rows=2, cols=2) + +out = gelu::forward(X) + +print(toString(out)) +print(toString(abs(out - out_expected))) +test_util::check_all_close(out, out_expected, 0.00001) + +# gradient = gelu::backward() +# test_util::check_all_close(gradient, gradient_expected, 0.00001) + +# Test case 2 From c0022e1609996ec2f7ab60af9fcd85884dcd3497 Mon Sep 17 00:00:00 2001 From: MaximilianSchreff Date: Fri, 10 Jan 2025 17:02:48 +0100 Subject: [PATCH 3/7] GELU backward pass --- scripts/nn/layers/gelu.dml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/nn/layers/gelu.dml b/scripts/nn/layers/gelu.dml index a324a72d7aa..23c1d407be1 100644 --- a/scripts/nn/layers/gelu.dml +++ b/scripts/nn/layers/gelu.dml @@ -52,10 +52,8 @@ forward = function(matrix[double] X) backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) { /* - * Computes the backward pass for a ReLU nonlinearity layer. - * - * Essentially performs a pass-through of the upstream gradient - * for cells > 0. 
+ * Computes the backward pass for a GELU nonlinearity layer, via + * its tanh approximation. * * Inputs: * - dout: Gradient wrt `out` from upstream, of same shape as `X`. @@ -64,5 +62,9 @@ backward = function(matrix[double] dout, matrix[double] X) * Outputs: * - dX: Gradient wrt `X`, of same shape as `X`. */ - dX = (X > 0) * dout + a = sqrt(2 / pi) + b = 0.044715 + T = tanh(a * (X + b * X^3)) + dT = 1 - T^2 + dX = dout * (0.5 * (1 + T) + 0.5 * X * dT * a * (1 + 3 * b * X^2)) } From 1285f6852412bad8ebcf75af1352b4954c629a79 Mon Sep 17 00:00:00 2001 From: MaximilianSchreff Date: Fri, 10 Jan 2025 17:03:13 +0100 Subject: [PATCH 4/7] Added backward pass to test case 1 --- src/test/scripts/applications/nn/component/gelu.dml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/test/scripts/applications/nn/component/gelu.dml b/src/test/scripts/applications/nn/component/gelu.dml index 61d92d56f8a..6feef7b48c5 100644 --- a/src/test/scripts/applications/nn/component/gelu.dml +++ b/src/test/scripts/applications/nn/component/gelu.dml @@ -27,6 +27,8 @@ source("src/test/scripts/applications/nn/util.dml") as test_util X = matrix("1. -0.5 0. 2.", rows=2, cols=2) +dout = matrix("1 1 + 1 1", rows=2, cols=2) out_expected = matrix("0.841192 -0.154286 0. 
1.9545977", rows=2, cols=2) gradient_expected = matrix("1.0829641 0.13263011 @@ -38,7 +40,7 @@ print(toString(out)) print(toString(abs(out - out_expected))) test_util::check_all_close(out, out_expected, 0.00001) -# gradient = gelu::backward() -# test_util::check_all_close(gradient, gradient_expected, 0.00001) - -# Test case 2 +gradient = gelu::backward(dout, X) +test_util::check_all_close(gradient, gradient_expected, 0.00001) +print(toString(gradient)) +print(toString(abs(gradient - gradient_expected))) From 43574ed05f45a729d489933f118020ac467bba1e Mon Sep 17 00:00:00 2001 From: MaximilianSchreff Date: Fri, 10 Jan 2025 17:03:40 +0100 Subject: [PATCH 5/7] Added another test case for forward & backward --- .../applications/nn/component/gelu.dml | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/test/scripts/applications/nn/component/gelu.dml b/src/test/scripts/applications/nn/component/gelu.dml index 6feef7b48c5..7ac079e4b0d 100644 --- a/src/test/scripts/applications/nn/component/gelu.dml +++ b/src/test/scripts/applications/nn/component/gelu.dml @@ -44,3 +44,25 @@ gradient = gelu::backward(dout, X) test_util::check_all_close(gradient, gradient_expected, 0.00001) print(toString(gradient)) print(toString(abs(gradient - gradient_expected))) + +# Test case 2 + +X = matrix("0.5 -1.5 + 1. 
-2.", rows=2, cols=2) +dout = matrix("1 1 + 1 1", rows=2, cols=2) +out_expected = matrix("0.345714 -0.10042843 + 0.841192 -0.04540229", rows=2, cols=2) +gradient_expected = matrix("0.8673699 -0.1277108 + 1.0829641 -0.08609922", rows=2, cols=2) + +out = gelu::forward(X) + +print(toString(out)) +print(toString(abs(out - out_expected))) +test_util::check_all_close(out, out_expected, 0.00001) + +gradient = gelu::backward(dout, X) +test_util::check_all_close(gradient, gradient_expected, 0.00001) +print(toString(gradient)) +print(toString(abs(gradient - gradient_expected))) From 6a1d063305d8e161e322f6bea4c345c5977cfe51 Mon Sep 17 00:00:00 2001 From: MaximilianSchreff Date: Fri, 10 Jan 2025 17:04:26 +0100 Subject: [PATCH 6/7] Removed comments from test script --- src/test/scripts/applications/nn/component/gelu.dml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/test/scripts/applications/nn/component/gelu.dml b/src/test/scripts/applications/nn/component/gelu.dml index 7ac079e4b0d..e52b4c58642 100644 --- a/src/test/scripts/applications/nn/component/gelu.dml +++ b/src/test/scripts/applications/nn/component/gelu.dml @@ -36,14 +36,10 @@ gradient_expected = matrix("1.0829641 0.13263011 out = gelu::forward(X) -print(toString(out)) -print(toString(abs(out - out_expected))) test_util::check_all_close(out, out_expected, 0.00001) gradient = gelu::backward(dout, X) test_util::check_all_close(gradient, gradient_expected, 0.00001) -print(toString(gradient)) -print(toString(abs(gradient - gradient_expected))) # Test case 2 @@ -58,11 +54,7 @@ gradient_expected = matrix("0.8673699 -0.1277108 out = gelu::forward(X) -print(toString(out)) -print(toString(abs(out - out_expected))) test_util::check_all_close(out, out_expected, 0.00001) gradient = gelu::backward(dout, X) test_util::check_all_close(gradient, gradient_expected, 0.00001) -print(toString(gradient)) -print(toString(abs(gradient - gradient_expected))) From 1c15d53828dd7d236670fb4f6ee849b6f1021032 Mon Sep 17 00:00:00 2001 
From: MaximilianSchreff Date: Fri, 10 Jan 2025 17:10:24 +0100 Subject: [PATCH 7/7] Testing integration into NNComponentTest --- .../test/applications/nn/NNComponentTest.java | 5 ++ .../applications/nn/component/gelu.dml | 58 ++++++++++--------- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/applications/nn/NNComponentTest.java b/src/test/java/org/apache/sysds/test/applications/nn/NNComponentTest.java index 3b002871d73..a9922cf35f7 100644 --- a/src/test/java/org/apache/sysds/test/applications/nn/NNComponentTest.java +++ b/src/test/java/org/apache/sysds/test/applications/nn/NNComponentTest.java @@ -124,6 +124,11 @@ public void resnet() { run("resnet_bottleneck.dml"); } + @Test + public void gelu() { + run("gelu.dml"); + } + @Override protected void run(String name) { super.run("component/" + name); diff --git a/src/test/scripts/applications/nn/component/gelu.dml b/src/test/scripts/applications/nn/component/gelu.dml index e52b4c58642..3d7ea833458 100644 --- a/src/test/scripts/applications/nn/component/gelu.dml +++ b/src/test/scripts/applications/nn/component/gelu.dml @@ -22,39 +22,45 @@ source("nn/layers/gelu.dml") as gelu source("src/test/scripts/applications/nn/util.dml") as test_util +gelu_test1 = function() { + print("Testing GELU, test 1") -# Test case 1 + X = matrix("1. -0.5 + 0. 2.", rows=2, cols=2) + dout = matrix("1 1 + 1 1", rows=2, cols=2) + out_expected = matrix("0.841192 -0.154286 + 0. 1.9545977", rows=2, cols=2) + gradient_expected = matrix("1.0829641 0.13263011 + 0.5 1.0860993", rows=2, cols=2) -X = matrix("1. -0.5 - 0. 2.", rows=2, cols=2) -dout = matrix("1 1 - 1 1", rows=2, cols=2) -out_expected = matrix("0.841192 -0.154286 - 0. 
1.9545977", rows=2, cols=2) -gradient_expected = matrix("1.0829641 0.13263011 - 0.5 1.0860993", rows=2, cols=2) + out = gelu::forward(X) -out = gelu::forward(X) + test_util::check_all_close(out, out_expected, 0.00001) -test_util::check_all_close(out, out_expected, 0.00001) + gradient = gelu::backward(dout, X) + test_util::check_all_close(gradient, gradient_expected, 0.00001) +} -gradient = gelu::backward(dout, X) -test_util::check_all_close(gradient, gradient_expected, 0.00001) +gelu_test2 = function() { + print("Testing GELU, test 2") -# Test case 2 + X = matrix("0.5 -1.5 + 1. -2.", rows=2, cols=2) + dout = matrix("1 1 + 1 1", rows=2, cols=2) + out_expected = matrix("0.345714 -0.10042843 + 0.841192 -0.04540229", rows=2, cols=2) + gradient_expected = matrix("0.8673699 -0.1277108 + 1.0829641 -0.08609922", rows=2, cols=2) -X = matrix("0.5 -1.5 - 1. -2.", rows=2, cols=2) -dout = matrix("1 1 - 1 1", rows=2, cols=2) -out_expected = matrix("0.345714 -0.10042843 - 0.841192 -0.04540229", rows=2, cols=2) -gradient_expected = matrix("0.8673699 -0.1277108 - 1.0829641 -0.08609922", rows=2, cols=2) + out = gelu::forward(X) -out = gelu::forward(X) + test_util::check_all_close(out, out_expected, 0.00001) -test_util::check_all_close(out, out_expected, 0.00001) + gradient = gelu::backward(dout, X) + test_util::check_all_close(gradient, gradient_expected, 0.00001) +} -gradient = gelu::backward(dout, X) -test_util::check_all_close(gradient, gradient_expected, 0.00001) +gelu_test1() +gelu_test2() \ No newline at end of file