From 39e710e805c60fbd231d6eb511f9b3de8b90e8f6 Mon Sep 17 00:00:00 2001
From: Aditya Sharad <adityasharad@github.com>
Date: Mon, 9 Jun 2025 08:37:37 -0700
Subject: [PATCH 1/6] Actions: Refactor logic for identifying command
 substitution

Extract helper predicates for `$(...)` command interpolation
and backtick-quoted commands.
Add some doc comments and meaningful variable names.
---
 actions/ql/lib/codeql/actions/Bash.qll | 68 ++++++++++++++++++--------
 1 file changed, 47 insertions(+), 21 deletions(-)

diff --git a/actions/ql/lib/codeql/actions/Bash.qll b/actions/ql/lib/codeql/actions/Bash.qll
index 4519a8949d7c..e5403c5fda72 100644
--- a/actions/ql/lib/codeql/actions/Bash.qll
+++ b/actions/ql/lib/codeql/actions/Bash.qll
@@ -8,29 +8,55 @@ class BashShellScript extends ShellScript {
     )
   }
 
-  private string lineProducer(int i) {
-    result = this.getRawScript().regexpReplaceAll("\\\\\\s*\n", "").splitAt("\n", i)
+  /**
+   * Gets the line at 0-based index `lineIndex` within this shell script,
+   * assuming newlines as separators.
+   */
+  private string lineProducer(int lineIndex) {
+    result = this.getRawScript().regexpReplaceAll("\\\\\\s*\n", "").splitAt("\n", lineIndex)
   }
 
-  private predicate cmdSubstitutionReplacement(string cmdSubs, string id, int k) {
-    exists(string line | line = this.lineProducer(k) |
-      exists(int i, int j |
-        cmdSubs =
-          // $() cmd substitution
-          line.regexpFind("\\$\\((?:[^()]+|\\((?:[^()]+|\\([^()]*\\))*\\))*\\)", i, j)
-              .regexpReplaceAll("^\\$\\(", "")
-              .regexpReplaceAll("\\)$", "") and
-        id = "cmdsubs:" + k + ":" + i + ":" + j
-      )
-      or
-      exists(int i, int j |
-        // `...` cmd substitution
-        cmdSubs =
-          line.regexpFind("\\`[^\\`]+\\`", i, j)
-              .regexpReplaceAll("^\\`", "")
-              .regexpReplaceAll("\\`$", "") and
-        id = "cmd:" + k + ":" + i + ":" + j
-      )
+  private predicate cmdSubstitutionReplacement(string command, string id, int lineIndex) {
+    this.commandInSubstitution(lineIndex, command, id)
+    or
+    this.commandInBackticks(lineIndex, command, id)
+  }
+
+  /**
+   * Holds if there is a command substitution `$(command)` in
+   * the line at `lineIndex` in the shell script,
+   * and `id` is a unique identifier for this command.
+   */
+  private predicate commandInSubstitution(int lineIndex, string command, string id) {
+    exists(int occurrenceIndex, int occurrenceOffset |
+      command =
+        // Look for the command inside a $(...) command substitution
+        this.lineProducer(lineIndex)
+            .regexpFind("\\$\\((?:[^()]+|\\((?:[^()]+|\\([^()]*\\))*\\))*\\)", occurrenceIndex,
+              occurrenceOffset)
+            // trim starting $( - TODO do this in first regex
+            .regexpReplaceAll("^\\$\\(", "")
+            // trim ending ) - TODO do this in first regex
+            .regexpReplaceAll("\\)$", "") and
+      id = "cmdsubs:" + lineIndex + ":" + occurrenceIndex + ":" + occurrenceOffset
+    )
+  }
+
+  /**
+   * Holds if `command` is a command in backticks `` `...` `` in
+   * the line at `lineIndex` in the shell script,
+   * and `id` is a unique identifier for this command.
+   */
+  private predicate commandInBackticks(int lineIndex, string command, string id) {
+    exists(int occurrenceIndex, int occurrenceOffset |
+      command =
+        this.lineProducer(lineIndex)
+            .regexpFind("\\`[^\\`]+\\`", occurrenceIndex, occurrenceOffset)
+            // trim leading backtick - TODO do this in first regex
+            .regexpReplaceAll("^\\`", "")
+            // trim trailing backtick - TODO do this in first regex
+            .regexpReplaceAll("\\`$", "") and
+      id = "cmd:" + lineIndex + ":" + occurrenceIndex + ":" + occurrenceOffset
     )
   }
 

From 321513c89bab2e2fbcc8580950fdd36deadbe793 Mon Sep 17 00:00:00 2001
From: Aditya Sharad <adityasharad@github.com>
Date: Mon, 9 Jun 2025 08:39:56 -0700
Subject: [PATCH 2/6] Actions: Order command substitutions by their ID, not
 text

In the Bash parser, we compute a mostly-unique ID for each
command substitution within a shell script block.
Commands are then ranked and referred to individually.

Avoid a performance bottleneck by ranking commands by their
ID, not by their source text.
I think this was the original intent of the code.
Ranking by their original text ends up evaluating multiple
possible orderings, which is slow on workflows that contain
multiple complex command substitutions.
---
 actions/ql/lib/codeql/actions/Bash.qll | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/actions/ql/lib/codeql/actions/Bash.qll b/actions/ql/lib/codeql/actions/Bash.qll
index e5403c5fda72..48fc348fff88 100644
--- a/actions/ql/lib/codeql/actions/Bash.qll
+++ b/actions/ql/lib/codeql/actions/Bash.qll
@@ -60,9 +60,12 @@ class BashShellScript extends ShellScript {
     )
   }
 
-  private predicate rankedCmdSubstitutionReplacements(int i, string old, string new) {
-    old = rank[i](string old2 | this.cmdSubstitutionReplacement(old2, _, _) | old2) and
-    this.cmdSubstitutionReplacement(old, new, _)
+  private predicate rankedCmdSubstitutionReplacements(int i, string command, string commandId) {
+    // rank commands by their unique IDs
+    commandId = rank[i](string c, string id | this.cmdSubstitutionReplacement(c, id, _) | id) and
+    // since we cannot output (command, ID) tuples from the rank operation,
+    // we need to work out the specific command associated with the resulting ID
+    this.cmdSubstitutionReplacement(command, commandId, _)
   }
 
   private predicate doReplaceCmdSubstitutions(int line, int round, string old, string new) {

From fbe11cfca63b4a7417a81cf2058e3d7aa3e82d12 Mon Sep 17 00:00:00 2001
From: Aditya Sharad <adityasharad@github.com>
Date: Mon, 9 Jun 2025 08:43:24 -0700
Subject: [PATCH 3/6] Actions: Refactor logic for identifying quoted strings

Add some doc comments and meaningful variable names.
---
 actions/ql/lib/codeql/actions/Bash.qll | 52 ++++++++++++++++++--------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/actions/ql/lib/codeql/actions/Bash.qll b/actions/ql/lib/codeql/actions/Bash.qll
index 48fc348fff88..9d2fb867d483 100644
--- a/actions/ql/lib/codeql/actions/Bash.qll
+++ b/actions/ql/lib/codeql/actions/Bash.qll
@@ -93,23 +93,45 @@ class BashShellScript extends ShellScript {
     this.cmdSubstitutionReplacement(result, _, i)
   }
 
+  /**
+   * Holds if `quotedStr` is a string in double quotes in
+   * the line at `lineIndex` in the shell script,
+   * and `id` is a unique identifier for this quoted string.
+   */
+  private predicate doubleQuotedString(int lineIndex, string quotedStr, string id) {
+    exists(int occurrenceIndex, int occurrenceOffset |
+      // double quoted string
+      quotedStr =
+        this.cmdSubstitutedLineProducer(lineIndex)
+            .regexpFind("\"((?:[^\"\\\\]|\\\\.)*)\"", occurrenceIndex, occurrenceOffset) and
+      id =
+        "qstr:" + lineIndex + ":" + occurrenceIndex + ":" + occurrenceOffset + ":" +
+          quotedStr.length() + ":" + quotedStr.regexpReplaceAll("[^a-zA-Z0-9]", "")
+    )
+  }
+
+  /**
+   * Holds if `quotedStr` is a string in single quotes in
+   * the line at `lineIndex` in the shell script,
+   * and `id` is a unique identifier for this quoted string.
+   */
+  private predicate singleQuotedString(int lineIndex, string quotedStr, string id) {
+    exists(int occurrenceIndex, int occurrenceOffset |
+      // single quoted string
+      quotedStr =
+        this.cmdSubstitutedLineProducer(lineIndex)
+            .regexpFind("'((?:\\\\.|[^'\\\\])*)'", occurrenceIndex, occurrenceOffset) and
+      id =
+        "qstr:" + lineIndex + ":" + occurrenceIndex + ":" + occurrenceOffset + ":" +
+          quotedStr.length() + ":" + quotedStr.regexpReplaceAll("[^a-zA-Z0-9]", "")
+    )
+  }
+
   private predicate quotedStringReplacement(string quotedStr, string id) {
-    exists(string line, int k | line = this.cmdSubstitutedLineProducer(k) |
-      exists(int i, int j |
-        // double quoted string
-        quotedStr = line.regexpFind("\"((?:[^\"\\\\]|\\\\.)*)\"", i, j) and
-        id =
-          "qstr:" + k + ":" + i + ":" + j + ":" + quotedStr.length() + ":" +
-            quotedStr.regexpReplaceAll("[^a-zA-Z0-9]", "")
-      )
+    exists(int lineIndex |
+      this.doubleQuotedString(lineIndex, quotedStr, id)
       or
-      exists(int i, int j |
-        // single quoted string
-        quotedStr = line.regexpFind("'((?:\\\\.|[^'\\\\])*)'", i, j) and
-        id =
-          "qstr:" + k + ":" + i + ":" + j + ":" + quotedStr.length() + ":" +
-            quotedStr.regexpReplaceAll("[^a-zA-Z0-9]", "")
-      )
+      this.singleQuotedString(lineIndex, quotedStr, id)
     ) and
     // Only do this for strings that might otherwise disrupt subsequent parsing
     quotedStr.regexpMatch("[\"'].*[$\n\r'\"" + Bash::separator() + "].*[\"']")

From 848064e95a99aa835a6cedc823a88d2360f89d20 Mon Sep 17 00:00:00 2001
From: Aditya Sharad <adityasharad@github.com>
Date: Mon, 9 Jun 2025 08:44:17 -0700
Subject: [PATCH 4/6] Actions: Order quoted strings by their ID, not text

In the Bash parser, we compute a mostly-unique ID for each
quoted string within a shell script block.
Quoted strings are then ranked and referred to individually.

Avoid a performance bottleneck by ranking quoted strings by their
ID, not by their source text.
I think this was the original intent of the code.

Ranking by their original text ends up evaluating multiple
possible orderings, which is slow on workflows that contain
multiple complex quoted strings, such as JSON payloads.
---
 actions/ql/lib/codeql/actions/Bash.qll | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/actions/ql/lib/codeql/actions/Bash.qll b/actions/ql/lib/codeql/actions/Bash.qll
index 9d2fb867d483..4975ce6f4cc5 100644
--- a/actions/ql/lib/codeql/actions/Bash.qll
+++ b/actions/ql/lib/codeql/actions/Bash.qll
@@ -137,9 +137,12 @@ class BashShellScript extends ShellScript {
     quotedStr.regexpMatch("[\"'].*[$\n\r'\"" + Bash::separator() + "].*[\"']")
   }
 
-  private predicate rankedQuotedStringReplacements(int i, string old, string new) {
-    old = rank[i](string old2 | this.quotedStringReplacement(old2, _) | old2) and
-    this.quotedStringReplacement(old, new)
+  private predicate rankedQuotedStringReplacements(int i, string quotedString, string quotedStringId) {
+    // rank quoted strings by their nearly-unique IDs
+    quotedStringId = rank[i](string s, string id | this.quotedStringReplacement(s, id) | id) and
+    // since we cannot output (string, ID) tuples from the rank operation,
+    // we need to work out the specific string associated with the resulting ID
+    this.quotedStringReplacement(quotedString, quotedStringId)
   }
 
   private predicate doReplaceQuotedStrings(int line, int round, string old, string new) {

From 2ecbecbd4b39cbd9ecef1af98442b273baa7a6ff Mon Sep 17 00:00:00 2001
From: Aditya Sharad <adityasharad@github.com>
Date: Mon, 9 Jun 2025 09:29:12 -0700
Subject: [PATCH 5/6] Actions: Add stress test for complex command and string
 interpolation

Anonymised version of a customer report that led to
performance bottlenecks in Bash parsing.
No results are expected from both query and library tests.
---
 .../.github/workflows/interpolation.yml       | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 actions/ql/test/query-tests/Security/CWE-094/.github/workflows/interpolation.yml

diff --git a/actions/ql/test/query-tests/Security/CWE-094/.github/workflows/interpolation.yml b/actions/ql/test/query-tests/Security/CWE-094/.github/workflows/interpolation.yml
new file mode 100644
index 000000000000..2b719a3a38ab
--- /dev/null
+++ b/actions/ql/test/query-tests/Security/CWE-094/.github/workflows/interpolation.yml
@@ -0,0 +1,81 @@
+name: Workflow with complex interpolation
+on: 
+  workflow_dispatch:
+    inputs:
+      choice-a:
+        required: true
+        type: choice
+        description: choice-a
+        default: a1
+        options: 
+        - a1
+        - a2
+        - a3
+      string-b:
+        required: false
+        type: string
+        description: string-b
+      string-c:
+        required: false
+        type: string
+        description: string-c
+      list-d:
+        required: true
+        type: string
+        default: d1 d2
+        description: list-d whitespace separated
+      list-e:
+        required: false
+        type: string
+        description: list-e whitespace separated
+      choice-f:
+        required: true
+        type: choice
+        description: choice-f
+        options: 
+        - false
+        - true
+          
+env:
+  DRY_TEST: false
+  B: ${{ github.event.inputs.string-b }}
+
+jobs:
+  job:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Produce values
+      id: produce-values
+      run: |
+        echo "region=region" >> $GITHUB_OUTPUT
+        echo "zone=zone" >> $GITHUB_OUTPUT
+
+    - name: Step with complex interpolation
+      id: complex
+      env:
+        CHOICE_A: ${{ github.event.inputs.choice-a }}
+        STRING_B: ${{ github.event.inputs.string-b }}
+        STRING_C: ${{ github.event.inputs.string-c }}
+        LIST_D: ${{ github.event.inputs.list-d }}
+        LIST_E: ${{ github.event.inputs.list-e }}
+        CHOICE_F: ${{ github.event.inputs.choice-f }}
+        REGION: ${{ steps.produce-values.outputs.region }}
+        ZONE: ${{ steps.produce-values.outputs.zone }}
+        DRY_TEST_JSON: ${{ fromJSON(env.DRY_TEST) }}
+        FUNCTION_NAME: my-function
+        USER_EMAIL: 'example@example.com'
+        TYPE: type
+        RANGE: '0-100'
+
+      run: |
+        comma_separated_list_d=$(echo "${LIST_D}" | sed "s/ /\",\"/g")
+        comma_separated_list_e=$(echo "${LIST_E}" | sed "s/ /\",\"/g")
+        c1=$(echo "${STRING_C}" | cut -d "-" -f 1)
+        c2=$(echo "${STRING_C}" | cut -d "-" -f 2)
+        # Similar commands that use JSON payloads with string interpolation.
+        response=$(aws lambda invoke --invocation-type RequestResponse --function-name "${FUNCTION_NAME}" --region "${REGION}" --cli-read-timeout 0 --cli-binary-format raw-in-base64-out --payload '{"appName":"my-app","chA":"'"${CHOICE_A}"'","c1":"'"${c1}"'","c2":"'"${c2}"'","a":"${CHOICE_A}","bValue":"${B}","zone":"${ZONE}","userEmail":"'"${USER_EMAIL}"'","region":"${REGION}","range":"${RANGE}","type":"${TYPE}","b":"${STRING_B}","listD":"","listE":"","dryTest":'"${DRY_TEST_JSON}"',"f":"${CHOICE_F}"}' ./config.json --log-type Tail)
+        response=$(aws lambda invoke --invocation-type RequestResponse --function-name "${FUNCTION_NAME}" --region "${REGION}" --cli-read-timeout 0 --cli-binary-format raw-in-base64-out --payload '{"appName":"my-app","chA":"'"${CHOICE_A}"'","c1":"'"${c1}"'","c2":"'"${c2}"'","a":"${CHOICE_A}","bValue":"${B}","zone":"${ZONE}","userEmail":"'"${USER_EMAIL}"'","region":"${REGION}","range":"${RANGE}","type":"${TYPE}","b":"${STRING_B}","listD":["'"${comma_separated_list_d}"'"],"listE":"","dryTest":'"${DRY_TEST_JSON}"',"f":"${CHOICE_F}"}' ./config.json --log-type Tail)
+        response=$(aws lambda invoke --invocation-type RequestResponse --function-name "${FUNCTION_NAME}" --region "${REGION}" --cli-read-timeout 0 --cli-binary-format raw-in-base64-out --payload '{"appName":"my-app","chA":"'"${CHOICE_A}"'","c1":"'"${c1}"'","c2":"'"${c2}"'","a":"${CHOICE_A}","bValue":"${B}","zone":"${ZONE}","userEmail":"'"${USER_EMAIL}"'","region":"${REGION}","range":"${RANGE}","type":"${TYPE}","b":"${STRING_B}","listD":["'"${comma_separated_list_d}"'"],"listE":"","dryTest":'"${DRY_TEST_JSON}"',"f":"${CHOICE_F}"}' ./config.json --log-type Tail)
+        response=$(aws lambda invoke --invocation-type RequestResponse --function-name "${FUNCTION_NAME}" --region "${REGION}" --cli-read-timeout 0 --cli-binary-format raw-in-base64-out --payload '{"appName":"my-app","chA":"'"${CHOICE_A}"'","c1":"'"${c1}"'","c2":"'"${c2}"'","a":"${CHOICE_A}","bValue":"${B}","zone":"${ZONE}","userEmail":"'"${USER_EMAIL}"'","region":"${REGION}","range":"${RANGE}","type":"${TYPE}","b":"${STRING_B}","listD":["'"${comma_separated_list_d}"'"],"listE":"","dryTest":'"${DRY_TEST_JSON}"',"f":"${CHOICE_F}"}' ./config.json --log-type Tail)
+        response=$(aws lambda invoke --invocation-type RequestResponse --function-name "${FUNCTION_NAME}" --region "${REGION}" --cli-read-timeout 0 --cli-binary-format raw-in-base64-out --payload '{"appName":"my-app","chA":"'"${CHOICE_A}"'","c1":"'"${c1}"'","c2":"'"${c2}"'","a":"${CHOICE_A}","bValue":"${B}","zone":"${ZONE}","userEmail":"'"${USER_EMAIL}"'","region":"${REGION}","range":"${RANGE}","type":"${TYPE}","b":"${STRING_B}","listD":"","listE":["'"${comma_separated_list_e}"'"],"dryTest":'"${DRY_TEST_JSON}"',"f":"${CHOICE_F}"}' ./config.json --log-type Tail)
+      shell: bash

From e48a7da8274240c84d0511f6c805ec401375ff2e Mon Sep 17 00:00:00 2001
From: Aditya Sharad <adityasharad@github.com>
Date: Mon, 9 Jun 2025 09:53:50 -0700
Subject: [PATCH 6/6] Actions: Add change note for Bash parsing fixes

---
 .../lib/change-notes/2025-06-09-bash-parsing-performance.md | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 actions/ql/lib/change-notes/2025-06-09-bash-parsing-performance.md

diff --git a/actions/ql/lib/change-notes/2025-06-09-bash-parsing-performance.md b/actions/ql/lib/change-notes/2025-06-09-bash-parsing-performance.md
new file mode 100644
index 000000000000..5ee29557c85e
--- /dev/null
+++ b/actions/ql/lib/change-notes/2025-06-09-bash-parsing-performance.md
@@ -0,0 +1,6 @@
+---
+category: minorAnalysis
+---
+* Fixed performance issues in the parsing of Bash scripts in workflow files,
+  which led to out-of-disk errors when analysing certain workflow files with
+  complex interpolations of shell commands or quoted strings.
\ No newline at end of file