From 642f9dcbea250c48ebb5e2f57b759d089878a870 Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Mon, 16 Dec 2024 16:35:37 +0000 Subject: [PATCH 1/6] Model missing builtins --- .../lib/semmle/python/frameworks/Stdlib.qll | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/python/ql/lib/semmle/python/frameworks/Stdlib.qll b/python/ql/lib/semmle/python/frameworks/Stdlib.qll index 45878c8160b2..087a2b57b2fa 100644 --- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll +++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll @@ -4523,6 +4523,118 @@ module StdlibPrivate { } } + /** A flow summary for `map`. */ + class MapSummary extends SummarizedCallable { + MapSummary() { this = "builtins.map" } + + override DataFlow::CallCfgNode getACall() { result = API::builtin("map").getACall() } + + override DataFlow::ArgumentNode getACallback() { + result = API::builtin("map").getAValueReachableFromSource() + } + + override predicate propagatesFlow(string input, string output, boolean preservesValue) { + exists(int i | exists(any(Call c).getArg(i)) | + ( + input = "Argument[" + (i + 1).toString() + "].ListElement" + or + input = "Argument[" + (i + 1).toString() + "].SetElement" + or + exists(DataFlow::TupleElementContent tc, int j | j = tc.getIndex() | + input = "Argument[" + (i + 1).toString() + "].TupleElement[" + j.toString() + "]" + ) + // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent + ) and + output = "Argument[0].Parameter[" + i.toString() + "]" and + preservesValue = true + ) + or + input = "Argument[0].ReturnValue" and + output = "ReturnValue.ListElement" and + preservesValue = true + } + } + + /** A flow summary for `filter`. */ + class FilterSummary extends SummarizedCallable { + FilterSummary() { this = "builtins.filter" } + + override DataFlow::CallCfgNode getACall() { result = API::builtin("filter").getACall() } + + override DataFlow::ArgumentNode getACallback() { + result = API::builtin("filter").getAValueReachableFromSource() + } + + override predicate propagatesFlow(string input, string output, boolean preservesValue) { + ( + input = "Argument[1].ListElement" + or + input = "Argument[1].SetElement" + or + exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() | + input = "Argument[1].TupleElement[" + i.toString() + "]" + ) + // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent + ) and + output = "Argument[0].Parameter[0]" and + preservesValue = true + } + } + + /**A summary for `enumerate`. */ + class EnumerateSummary extends SummarizedCallable { + EnumerateSummary() { this = "builtins.enumerate" } + + override DataFlow::CallCfgNode getACall() { result = API::builtin("enumerate").getACall() } + + override DataFlow::ArgumentNode getACallback() { + result = API::builtin("enumerate").getAValueReachableFromSource() + } + + override predicate propagatesFlow(string input, string output, boolean preservesValue) { + ( + input = "Argument[0].ListElement" + or + input = "Argument[0].SetElement" + or + exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() | + input = "Argument[0].TupleElement[" + i.toString() + "]" + ) + // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent + ) and + output = "ReturnValue.ListElement.TupleElement[1]" and + preservesValue = true + } + } + + /** A flow summary for ``zip`. */ + class ZipSummary extends SummarizedCallable { + ZipSummary() { this = "builtins.zip" } + + override DataFlow::CallCfgNode getACall() { result = API::builtin("zip").getACall() } + + override DataFlow::ArgumentNode getACallback() { + result = API::builtin("zip").getAValueReachableFromSource() + } + + override predicate propagatesFlow(string input, string output, boolean preservesValue) { + exists(int i | exists(any(Call c).getArg(i)) | + ( + input = "Argument[" + i.toString() + "].ListElement" + or + input = "Argument[" + i.toString() + "].SetElement" + or + exists(DataFlow::TupleElementContent tc, int j | j = tc.getIndex() | + input = "Argument[" + i.toString() + "].TupleElement[" + j.toString() + "]" + ) + // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent + ) and + output = "ReturnValue.ListElement.TupleElement[" + i.toString() + "]" and + preservesValue = true + ) + } + } + // --------------------------------------------------------------------------- // Flow summaries for container methods // --------------------------------------------------------------------------- From 4e36008ed9cb821cfe12a890f0336e8d4c53395a Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Mon, 6 Jan 2025 09:24:39 +0000 Subject: [PATCH 2/6] Add tests --- .../lib/semmle/python/frameworks/Stdlib.qll | 2 +- .../dataflow/coverage/test_builtins.py | 108 ++++++++++++++++++ .../variable-capture/test_library_calls.py | 2 +- 3 files changed, 110 insertions(+), 2 deletions(-) diff --git a/python/ql/lib/semmle/python/frameworks/Stdlib.qll b/python/ql/lib/semmle/python/frameworks/Stdlib.qll index 087a2b57b2fa..80f20e1438b2 100644 --- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll +++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll @@ -4576,7 +4576,7 @@ module StdlibPrivate { ) // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent ) and - output = "Argument[0].Parameter[0]" and + (output = "Argument[0].Parameter[0]" or output = "ReturnValue.ListElement") and preservesValue = true } } diff --git a/python/ql/test/library-tests/dataflow/coverage/test_builtins.py b/python/ql/test/library-tests/dataflow/coverage/test_builtins.py index 3195e9b5f6d4..950820853957 100644 --- a/python/ql/test/library-tests/dataflow/coverage/test_builtins.py +++ b/python/ql/test/library-tests/dataflow/coverage/test_builtins.py @@ -366,3 +366,111 @@ def test_next_dict(): i = iter(d) n = next(i) SINK(n) #$ MISSING: flow="SOURCE, l:-3 -> n" + +### map + +@expects(4) +def test_map_list(): + l1 = [SOURCE] + l2 = [NONSOURCE] + + def f(p1,p2): + SINK(p1) #$ flow="SOURCE, l:-4 -> p1" + SINK_F(p2) + + return p1,p2 + + rl = list(map(f, l1, l2)) + SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" + SINK_F(rl[0][1]) + +@expects(4) +def test_map_set(): + s1 = {SOURCE} + s2 = {NONSOURCE} + + def f(p1,p2): + SINK(p1) #$ flow="SOURCE, l:-4 -> p1" + SINK_F(p2) + + return p1,p2 + + rl = list(map(f, s1, s2)) + SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" + SINK_F(rl[0][1]) + +@expects(4) +def test_map_tuple(): + t1 = (SOURCE,) + t2 = (NONSOURCE,) + + def f(p1,p2): + SINK(p1) #$ flow="SOURCE, l:-4 -> p1" + SINK_F(p2) + + return p1,p2 + + rl = list(map(f, t1, t2)) + SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" + SINK_F(rl[0][1]) + +@expects(4) +def test_map_dict(): + d1 = {SOURCE: "v1"} + d2 = {NONSOURCE: "v2"} + + def f(p1,p2): + SINK(p1) #$ MISSING: flow="SOURCE, l:-4 -> p1" + SINK_F(p2) + + return p1,p2 + + rl = list(map(f, d1, d2)) + SINK(rl[0][0]) #$ MISSING: flow="SOURCE, l:-10 -> rl[0][0]" + SINK_F(rl[0][1]) + +### filter + +@expects(2) +def test_filter_list(): + l = [SOURCE] + + def f(p): + SINK(p) #$ flow="SOURCE, l:-3 -> p" + return True + + rl = list(filter(f,l)) + SINK(rl[0]) #$ flow="SOURCE, l:-7 -> rl[0]" + +@expects(2) +def test_filter_set(): + s = {SOURCE} + + def f(p): + SINK(p) #$ flow="SOURCE, l:-3 -> p" + return True + + rl = list(filter(f,s)) + SINK(rl[0]) #$ flow="SOURCE, l:-7 -> rl[0]" + +@expects(2) +def test_filter_tuple(): + t = (SOURCE,) + + def f(p): + SINK(p) #$ flow="SOURCE, l:-3 -> p" + return True + + rl = list(filter(f,t)) + SINK(rl[0]) #$ flow="SOURCE, l:-7 -> rl[0]" + +@expects(2) +def test_filter_dict(): + d = {SOURCE: "v"} + + def f(p): + SINK(p) #$ MISSING: flow="SOURCE, l:-3 -> p" + return True + + rl = list(filter(f,d)) + SINK(rl[0]) #$ MISSING: flow="SOURCE, l:-7 -> rl[0]" diff --git a/python/ql/test/library-tests/dataflow/variable-capture/test_library_calls.py b/python/ql/test/library-tests/dataflow/variable-capture/test_library_calls.py index 70b07f66557a..5db25fc83486 100644 --- a/python/ql/test/library-tests/dataflow/variable-capture/test_library_calls.py +++ b/python/ql/test/library-tests/dataflow/variable-capture/test_library_calls.py @@ -45,4 +45,4 @@ def set(x): for x in map(set, [1]): pass - SINK(captured["x"]) #$ MISSING: captured + SINK(captured["x"]) #$ captured From 460de3f7d5d8c58a517615cf8a097f27a20efcff Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Tue, 14 Jan 2025 09:38:34 +0000 Subject: [PATCH 3/6] Reduce generality of map and zip for performance --- .../lib/semmle/python/frameworks/Stdlib.qll | 8 +++- .../dataflow/coverage/test_builtins.py | 38 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/python/ql/lib/semmle/python/frameworks/Stdlib.qll b/python/ql/lib/semmle/python/frameworks/Stdlib.qll index 80f20e1438b2..e69bebb36ee1 100644 --- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll +++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll @@ -4540,8 +4540,11 @@ module StdlibPrivate { or input = "Argument[" + (i + 1).toString() + "].SetElement" or + // We reduce generality slightly by not tracking tuple contents on list arguments beyond the first, for performance. + // TODO: Once we have TupleElementAny, this generality can be increased. + i = 0 and exists(DataFlow::TupleElementContent tc, int j | j = tc.getIndex() | - input = "Argument[" + (i + 1).toString() + "].TupleElement[" + j.toString() + "]" + input = "Argument[1].TupleElement[" + j.toString() + "]" ) // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent ) and @@ -4624,6 +4627,9 @@ module StdlibPrivate { or input = "Argument[" + i.toString() + "].SetElement" or + // We reduce generality slightly by not tracking tuple contents on arguments beyond the first two, for performance. + // TODO: Once we have TupleElementAny, this generality can be increased. + i in [0 .. 1] and exists(DataFlow::TupleElementContent tc, int j | j = tc.getIndex() | input = "Argument[" + i.toString() + "].TupleElement[" + j.toString() + "]" ) diff --git a/python/ql/test/library-tests/dataflow/coverage/test_builtins.py b/python/ql/test/library-tests/dataflow/coverage/test_builtins.py index 950820853957..2889c4d64006 100644 --- a/python/ql/test/library-tests/dataflow/coverage/test_builtins.py +++ b/python/ql/test/library-tests/dataflow/coverage/test_builtins.py @@ -414,6 +414,7 @@ def f(p1,p2): SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" SINK_F(rl[0][1]) + @expects(4) def test_map_dict(): d1 = {SOURCE: "v1"} @@ -429,6 +430,35 @@ def f(p1,p2): SINK(rl[0][0]) #$ MISSING: flow="SOURCE, l:-10 -> rl[0][0]" SINK_F(rl[0][1]) +@expects(6) +def test_map_multi_list(): + l1 = [SOURCE] + l2 = [SOURCE] + + def f(p1,p2): + SINK(p1) #$ flow="SOURCE, l:-4 -> p1" + SINK(p2) #$ flow="SOURCE, l:-4 -> p2" + + return p1,p2 + + rl = list(map(f, l1, l2)) + SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" + SINK(rl[0][1]) #$ flow="SOURCE, l:-10 -> rl[0][1]" + +@expects(6) +def test_map_multi_tuple(): + l1 = (SOURCE,) + l2 = (SOURCE,) + + def f(p1,p2): + SINK(p1) #$ flow="SOURCE, l:-4 -> p1" + SINK(p2) #$ MISSING: flow="SOURCE, l:-4 -> p2" # Tuples are not tracked beyond the first list argument for performance. + return p1,p2 + + rl = list(map(f, l1, l2)) + SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" + SINK(rl[0][1]) #$ MISSING: flow="SOURCE, l:-10 -> rl[0][1]" + ### filter @expects(2) @@ -474,3 +504,11 @@ def f(p): rl = list(filter(f,d)) SINK(rl[0]) #$ MISSING: flow="SOURCE, l:-7 -> rl[0]" + +def test_enumerate_list(): + # TODO + pass + +def test_zip_list(): + # TODO + pass \ No newline at end of file From 6a6585e415c05e6869e16d6167192cae9c511b55 Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Tue, 14 Jan 2025 17:11:24 +0000 Subject: [PATCH 4/6] Add tests for zip and enumerate --- .../dataflow/coverage/test_builtins.py | 119 ++++++++++++++++-- 1 file changed, 108 insertions(+), 11 deletions(-) diff --git a/python/ql/test/library-tests/dataflow/coverage/test_builtins.py b/python/ql/test/library-tests/dataflow/coverage/test_builtins.py index 2889c4d64006..12193f970824 100644 --- a/python/ql/test/library-tests/dataflow/coverage/test_builtins.py +++ b/python/ql/test/library-tests/dataflow/coverage/test_builtins.py @@ -430,7 +430,7 @@ def f(p1,p2): SINK(rl[0][0]) #$ MISSING: flow="SOURCE, l:-10 -> rl[0][0]" SINK_F(rl[0][1]) -@expects(6) +@expects(4) def test_map_multi_list(): l1 = [SOURCE] l2 = [SOURCE] @@ -438,14 +438,13 @@ def test_map_multi_list(): def f(p1,p2): SINK(p1) #$ flow="SOURCE, l:-4 -> p1" SINK(p2) #$ flow="SOURCE, l:-4 -> p2" - return p1,p2 rl = list(map(f, l1, l2)) - SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" - SINK(rl[0][1]) #$ flow="SOURCE, l:-10 -> rl[0][1]" + SINK(rl[0][0]) #$ flow="SOURCE, l:-9 -> rl[0][0]" + SINK(rl[0][1]) #$ flow="SOURCE, l:-9 -> rl[0][1]" -@expects(6) +@expects(4) def test_map_multi_tuple(): l1 = (SOURCE,) l2 = (SOURCE,) @@ -456,8 +455,8 @@ def f(p1,p2): return p1,p2 rl = list(map(f, l1, l2)) - SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" - SINK(rl[0][1]) #$ MISSING: flow="SOURCE, l:-10 -> rl[0][1]" + SINK(rl[0][0]) #$ flow="SOURCE, l:-9 -> rl[0][0]" + SINK(rl[0][1]) #$ MISSING: flow="SOURCE, l:-9 -> rl[0][1]" ### filter @@ -505,10 +504,108 @@ def f(p): rl = list(filter(f,d)) SINK(rl[0]) #$ MISSING: flow="SOURCE, l:-7 -> rl[0]" +@expects(1) def test_enumerate_list(): - # TODO - pass + l = [SOURCE] + + e = list(enumerate(l)) + + SINK(e[0][1]) #$ flow="SOURCE, l:-4 -> e[0][1]" + +@expects(1) +def test_enumerate_set(): + s = {SOURCE} + + e = list(enumerate(s)) + + SINK(e[0][1]) #$ flow="SOURCE, l:-4 -> e[0][1]" + +@expects(1) +def test_enumerate_tuple(): + t = (SOURCE,) + + e = list(enumerate(t)) + SINK(e[0][1]) #$ flow="SOURCE, l:-4 -> e[0][1]" + +@expects(2) +def test_enumerate_list_for(): + l = [SOURCE] + + for i, x in enumerate(l): + SINK(x) #$ flow="SOURCE, l:-3 -> x" + + for t in enumerate(l): + SINK(t[1]) #$ flow="SOURCE, l:-6 -> t[1]" + +@expects(1) +def test_enumerate_dict(): + d = {SOURCE:"v"} + + e = list(enumerate(d)) + + SINK(e[0][1]) # $ MISSING: flow="SOURCE, l:-4 -> e[0][1]" + +@expects(8) def test_zip_list(): - # TODO - pass \ No newline at end of file + l1 = [SOURCE, SOURCE] + l2 = [SOURCE, NONSOURCE] + l3 = [NONSOURCE, SOURCE] + l4 = [NONSOURCE, NONSOURCE] + + z = list(zip(l1,l2,l3,l4)) + + SINK(z[0][0]) #$ flow="SOURCE, l:-7 -> z[0][0]" + SINK(z[0][1]) #$ flow="SOURCE, l:-7 -> z[0][1]" + SINK_F(z[0][2]) #$ SPURIOUS: flow="SOURCE, l:-7 -> z[0][2]" + SINK_F(z[0][3]) + SINK(z[1][0]) #$ flow="SOURCE, l:-11 -> z[1][0]" + SINK_F(z[1][1]) #$ SPURIOUS: flow="SOURCE, l:-11 -> z[1][1]" + SINK(z[1][2]) #$ flow="SOURCE, l:-11 -> z[1][2]" + SINK_F(z[1][3]) + +@expects(4) +def test_zip_set(): + s1 = {SOURCE} + s2 = {NONSOURCE} + s3 = {SOURCE} + s4 = {NONSOURCE} + + z = list(zip(s1,s2,s3,s4)) + + SINK(z[0][0]) #$ flow="SOURCE, l:-11 -> z[0][0]" + SINK_F(z[0][1]) + SINK(z[0][2]) #$ flow="SOURCE, l:-11 -> z[0][2]" + SINK_F(z[0][3]) + +@expects(8) +def test_zip_tuple(): + t1 = (SOURCE, SOURCE) + t2 = (SOURCE, NONSOURCE) + t3 = (NONSOURCE, SOURCE) + t4 = (NONSOURCE, NONSOURCE) + + z = list(zip(t1,t2,t3,t4)) + + SINK(z[0][0]) #$ flow="SOURCE, l:-7 -> z[0][0]" + SINK(z[0][1]) #$ flow="SOURCE, l:-7 -> z[0][1]" + SINK_F(z[0][2]) + SINK_F(z[0][3]) + SINK(z[1][0]) #$ flow="SOURCE, l:-11 -> z[1][0]" + SINK_F(z[1][1]) #$ SPURIOUS: flow="SOURCE, l:-11 -> z[1][1]" + SINK(z[1][2]) #$ MISSING: flow="SOURCE, l:-11 -> z[1][2]" # Tuple contents are not tracked beyond the first two arguments for performance. + SINK_F(z[1][3]) + +@expects(4) +def test_zip_dict(): + d1 = {SOURCE: "v"} + d2 = {NONSOURCE: "v"} + d3 = {SOURCE: "v"} + d4 = {NONSOURCE: "v"} + + z = list(zip(d1,d2,d3,d4)) + + SINK(z[0][0]) #$ MISSING: flow="SOURCE, l:-11 -> z[0][0]" + SINK_F(z[0][1]) + SINK(z[0][2]) #$ MISSING: flow="SOURCE, l:-11 -> z[0][2]" + SINK_F(z[0][3]) \ No newline at end of file From 2aea356756c8cd5e943ad95fc8eda4e4d41ff19d Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Wed, 15 Jan 2025 10:08:48 +0000 Subject: [PATCH 5/6] Add change note + fix tests --- python/ql/lib/change-notes/2025-01-15-builtin-model.md | 4 ++++ .../test/library-tests/dataflow/coverage/test_builtins.py | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 python/ql/lib/change-notes/2025-01-15-builtin-model.md diff --git a/python/ql/lib/change-notes/2025-01-15-builtin-model.md b/python/ql/lib/change-notes/2025-01-15-builtin-model.md new file mode 100644 index 000000000000..c7933d09d044 --- /dev/null +++ b/python/ql/lib/change-notes/2025-01-15-builtin-model.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* Additional data flow models for the builtin functions `map`, `filter`, `zip`, and `enumerate` have been added. \ No newline at end of file diff --git a/python/ql/test/library-tests/dataflow/coverage/test_builtins.py b/python/ql/test/library-tests/dataflow/coverage/test_builtins.py index 12193f970824..d609dbe3ef27 100644 --- a/python/ql/test/library-tests/dataflow/coverage/test_builtins.py +++ b/python/ql/test/library-tests/dataflow/coverage/test_builtins.py @@ -573,9 +573,9 @@ def test_zip_set(): z = list(zip(s1,s2,s3,s4)) - SINK(z[0][0]) #$ flow="SOURCE, l:-11 -> z[0][0]" + SINK(z[0][0]) #$ flow="SOURCE, l:-7 -> z[0][0]" SINK_F(z[0][1]) - SINK(z[0][2]) #$ flow="SOURCE, l:-11 -> z[0][2]" + SINK(z[0][2]) #$ flow="SOURCE, l:-7 -> z[0][2]" SINK_F(z[0][3]) @expects(8) @@ -605,7 +605,7 @@ def test_zip_dict(): z = list(zip(d1,d2,d3,d4)) - SINK(z[0][0]) #$ MISSING: flow="SOURCE, l:-11 -> z[0][0]" + SINK(z[0][0]) #$ MISSING: flow="SOURCE, l:-7 -> z[0][0]" SINK_F(z[0][1]) - SINK(z[0][2]) #$ MISSING: flow="SOURCE, l:-11 -> z[0][2]" + SINK(z[0][2]) #$ MISSING: flow="SOURCE, l:-7 -> z[0][2]" SINK_F(z[0][3]) \ No newline at end of file From 344dd2dab58cd8111c00797335e6e883057151cb Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Wed, 15 Jan 2025 10:26:37 +0000 Subject: [PATCH 6/6] Qldoc fix --- python/ql/lib/semmle/python/frameworks/Stdlib.qll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ql/lib/semmle/python/frameworks/Stdlib.qll b/python/ql/lib/semmle/python/frameworks/Stdlib.qll index e69bebb36ee1..201354216004 100644 --- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll +++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll @@ -4610,7 +4610,7 @@ module StdlibPrivate { } } - /** A flow summary for ``zip`. */ + /** A flow summary for `zip`. */ class ZipSummary extends SummarizedCallable { ZipSummary() { this = "builtins.zip" }