diff --git a/python/ql/lib/change-notes/2025-01-15-builtin-model.md b/python/ql/lib/change-notes/2025-01-15-builtin-model.md new file mode 100644 index 000000000000..c7933d09d044 --- /dev/null +++ b/python/ql/lib/change-notes/2025-01-15-builtin-model.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* Additional data flow models for the builtin functions `map`, `filter`, `zip`, and `enumerate` have been added. \ No newline at end of file diff --git a/python/ql/lib/semmle/python/frameworks/Stdlib.qll b/python/ql/lib/semmle/python/frameworks/Stdlib.qll index 45878c8160b2..201354216004 100644 --- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll +++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll @@ -4523,6 +4523,124 @@ module StdlibPrivate { } } + /** A flow summary for `map`: list and set elements of each iterable argument (and tuple elements of the first iterable only) flow to the matching parameter of the callback in `Argument[0]`, and the callback's return value flows to the list elements of the result. */ + class MapSummary extends SummarizedCallable { + MapSummary() { this = "builtins.map" } + + override DataFlow::CallCfgNode getACall() { result = API::builtin("map").getACall() } + + override DataFlow::ArgumentNode getACallback() { + result = API::builtin("map").getAValueReachableFromSource() + } + + override predicate propagatesFlow(string input, string output, boolean preservesValue) { + exists(int i | exists(any(Call c).getArg(i)) | + ( + input = "Argument[" + (i + 1).toString() + "].ListElement" + or + input = "Argument[" + (i + 1).toString() + "].SetElement" + or + // We reduce generality slightly by not tracking tuple contents on list arguments beyond the first, for performance. + // TODO: Once we have TupleElementAny, this generality can be increased.
+ i = 0 and + exists(DataFlow::TupleElementContent tc, int j | j = tc.getIndex() | + input = "Argument[1].TupleElement[" + j.toString() + "]" + ) + // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent + ) and + output = "Argument[0].Parameter[" + i.toString() + "]" and + preservesValue = true + ) + or + input = "Argument[0].ReturnValue" and + output = "ReturnValue.ListElement" and + preservesValue = true + } + } + + /** A flow summary for `filter`: list, set, and tuple elements of the iterable in `Argument[1]` flow both to the first parameter of the callback in `Argument[0]` and to the list elements of the result. */ + class FilterSummary extends SummarizedCallable { + FilterSummary() { this = "builtins.filter" } + + override DataFlow::CallCfgNode getACall() { result = API::builtin("filter").getACall() } + + override DataFlow::ArgumentNode getACallback() { + result = API::builtin("filter").getAValueReachableFromSource() + } + + override predicate propagatesFlow(string input, string output, boolean preservesValue) { + ( + input = "Argument[1].ListElement" + or + input = "Argument[1].SetElement" + or + exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() | + input = "Argument[1].TupleElement[" + i.toString() + "]" + ) + // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent + ) and + (output = "Argument[0].Parameter[0]" or output = "ReturnValue.ListElement") and + preservesValue = true + } + } + + /** A flow summary for `enumerate`: list, set, and tuple elements of the iterable in `Argument[0]` flow to index 1 of the tuples stored as list elements of the result.
*/ + class EnumerateSummary extends SummarizedCallable { + EnumerateSummary() { this = "builtins.enumerate" } + + override DataFlow::CallCfgNode getACall() { result = API::builtin("enumerate").getACall() } + + override DataFlow::ArgumentNode getACallback() { + result = API::builtin("enumerate").getAValueReachableFromSource() + } + + override predicate propagatesFlow(string input, string output, boolean preservesValue) { + ( + input = "Argument[0].ListElement" + or + input = "Argument[0].SetElement" + or + exists(DataFlow::TupleElementContent tc, int i | i = tc.getIndex() | + input = "Argument[0].TupleElement[" + i.toString() + "]" + ) + // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent + ) and + output = "ReturnValue.ListElement.TupleElement[1]" and + preservesValue = true + } + } + + /** A flow summary for `zip`: list and set elements of the argument at position `i` (and tuple elements of the first two arguments only) flow to index `i` of the tuples stored as list elements of the result. */ + class ZipSummary extends SummarizedCallable { + ZipSummary() { this = "builtins.zip" } + + override DataFlow::CallCfgNode getACall() { result = API::builtin("zip").getACall() } + + override DataFlow::ArgumentNode getACallback() { + result = API::builtin("zip").getAValueReachableFromSource() + } + + override predicate propagatesFlow(string input, string output, boolean preservesValue) { + exists(int i | exists(any(Call c).getArg(i)) | + ( + input = "Argument[" + i.toString() + "].ListElement" + or + input = "Argument[" + i.toString() + "].SetElement" + or + // We reduce generality slightly by not tracking tuple contents on arguments beyond the first two, for performance. + // TODO: Once we have TupleElementAny, this generality can be increased. + i in [0 ..
1] and + exists(DataFlow::TupleElementContent tc, int j | j = tc.getIndex() | + input = "Argument[" + i.toString() + "].TupleElement[" + j.toString() + "]" + ) + // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent + ) and + output = "ReturnValue.ListElement.TupleElement[" + i.toString() + "]" and + preservesValue = true + ) + } + } + // --------------------------------------------------------------------------- // Flow summaries for container methods // --------------------------------------------------------------------------- diff --git a/python/ql/test/library-tests/dataflow/coverage/test_builtins.py b/python/ql/test/library-tests/dataflow/coverage/test_builtins.py index 3195e9b5f6d4..d609dbe3ef27 100644 --- a/python/ql/test/library-tests/dataflow/coverage/test_builtins.py +++ b/python/ql/test/library-tests/dataflow/coverage/test_builtins.py @@ -366,3 +366,246 @@ def test_next_dict(): i = iter(d) n = next(i) SINK(n) #$ MISSING: flow="SOURCE, l:-3 -> n" + +### map + +@expects(4) +def test_map_list(): + l1 = [SOURCE] + l2 = [NONSOURCE] + + def f(p1,p2): + SINK(p1) #$ flow="SOURCE, l:-4 -> p1" + SINK_F(p2) + + return p1,p2 + + rl = list(map(f, l1, l2)) + SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" + SINK_F(rl[0][1]) + +@expects(4) +def test_map_set(): + s1 = {SOURCE} + s2 = {NONSOURCE} + + def f(p1,p2): + SINK(p1) #$ flow="SOURCE, l:-4 -> p1" + SINK_F(p2) + + return p1,p2 + + rl = list(map(f, s1, s2)) + SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" + SINK_F(rl[0][1]) + +@expects(4) +def test_map_tuple(): + t1 = (SOURCE,) + t2 = (NONSOURCE,) + + def f(p1,p2): + SINK(p1) #$ flow="SOURCE, l:-4 -> p1" + SINK_F(p2) + + return p1,p2 + + rl = list(map(f, t1, t2)) + SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]" + SINK_F(rl[0][1]) + + +@expects(4) +def test_map_dict(): + d1 = {SOURCE: "v1"} + d2 = {NONSOURCE: "v2"} + + def f(p1,p2): + SINK(p1) #$ MISSING: flow="SOURCE, l:-4 -> p1" + SINK_F(p2) + + return p1,p2 + + rl =
list(map(f, d1, d2)) + SINK(rl[0][0]) #$ MISSING: flow="SOURCE, l:-10 -> rl[0][0]" + SINK_F(rl[0][1]) + +@expects(4) +def test_map_multi_list(): + l1 = [SOURCE] + l2 = [SOURCE] + + def f(p1,p2): + SINK(p1) #$ flow="SOURCE, l:-4 -> p1" + SINK(p2) #$ flow="SOURCE, l:-4 -> p2" + return p1,p2 + + rl = list(map(f, l1, l2)) + SINK(rl[0][0]) #$ flow="SOURCE, l:-9 -> rl[0][0]" + SINK(rl[0][1]) #$ flow="SOURCE, l:-9 -> rl[0][1]" + +@expects(4) +def test_map_multi_tuple(): + l1 = (SOURCE,) + l2 = (SOURCE,) + + def f(p1,p2): + SINK(p1) #$ flow="SOURCE, l:-4 -> p1" + SINK(p2) #$ MISSING: flow="SOURCE, l:-4 -> p2" # Tuples are not tracked beyond the first list argument for performance. + return p1,p2 + + rl = list(map(f, l1, l2)) + SINK(rl[0][0]) #$ flow="SOURCE, l:-9 -> rl[0][0]" + SINK(rl[0][1]) #$ MISSING: flow="SOURCE, l:-9 -> rl[0][1]" + +### filter + +@expects(2) +def test_filter_list(): + l = [SOURCE] + + def f(p): + SINK(p) #$ flow="SOURCE, l:-3 -> p" + return True + + rl = list(filter(f,l)) + SINK(rl[0]) #$ flow="SOURCE, l:-7 -> rl[0]" + +@expects(2) +def test_filter_set(): + s = {SOURCE} + + def f(p): + SINK(p) #$ flow="SOURCE, l:-3 -> p" + return True + + rl = list(filter(f,s)) + SINK(rl[0]) #$ flow="SOURCE, l:-7 -> rl[0]" + +@expects(2) +def test_filter_tuple(): + t = (SOURCE,) + + def f(p): + SINK(p) #$ flow="SOURCE, l:-3 -> p" + return True + + rl = list(filter(f,t)) + SINK(rl[0]) #$ flow="SOURCE, l:-7 -> rl[0]" + +@expects(2) +def test_filter_dict(): + d = {SOURCE: "v"} + + def f(p): + SINK(p) #$ MISSING: flow="SOURCE, l:-3 -> p" + return True + + rl = list(filter(f,d)) + SINK(rl[0]) #$ MISSING: flow="SOURCE, l:-7 -> rl[0]" + +@expects(1) +def test_enumerate_list(): + l = [SOURCE] + + e = list(enumerate(l)) + + SINK(e[0][1]) #$ flow="SOURCE, l:-4 -> e[0][1]" + +@expects(1) +def test_enumerate_set(): + s = {SOURCE} + + e = list(enumerate(s)) + + SINK(e[0][1]) #$ flow="SOURCE, l:-4 -> e[0][1]" + +@expects(1) +def test_enumerate_tuple(): + t = (SOURCE,) + + e =
list(enumerate(t)) + + SINK(e[0][1]) #$ flow="SOURCE, l:-4 -> e[0][1]" + +@expects(2) +def test_enumerate_list_for(): + l = [SOURCE] + + for i, x in enumerate(l): + SINK(x) #$ flow="SOURCE, l:-3 -> x" + + for t in enumerate(l): + SINK(t[1]) #$ flow="SOURCE, l:-6 -> t[1]" + +@expects(1) +def test_enumerate_dict(): + d = {SOURCE:"v"} + + e = list(enumerate(d)) + + SINK(e[0][1]) # $ MISSING: flow="SOURCE, l:-4 -> e[0][1]" + +@expects(8) +def test_zip_list(): + l1 = [SOURCE, SOURCE] + l2 = [SOURCE, NONSOURCE] + l3 = [NONSOURCE, SOURCE] + l4 = [NONSOURCE, NONSOURCE] + + z = list(zip(l1,l2,l3,l4)) + + SINK(z[0][0]) #$ flow="SOURCE, l:-7 -> z[0][0]" + SINK(z[0][1]) #$ flow="SOURCE, l:-7 -> z[0][1]" + SINK_F(z[0][2]) #$ SPURIOUS: flow="SOURCE, l:-7 -> z[0][2]" + SINK_F(z[0][3]) + SINK(z[1][0]) #$ flow="SOURCE, l:-11 -> z[1][0]" + SINK_F(z[1][1]) #$ SPURIOUS: flow="SOURCE, l:-11 -> z[1][1]" + SINK(z[1][2]) #$ flow="SOURCE, l:-11 -> z[1][2]" + SINK_F(z[1][3]) + +@expects(4) +def test_zip_set(): + s1 = {SOURCE} + s2 = {NONSOURCE} + s3 = {SOURCE} + s4 = {NONSOURCE} + + z = list(zip(s1,s2,s3,s4)) + + SINK(z[0][0]) #$ flow="SOURCE, l:-7 -> z[0][0]" + SINK_F(z[0][1]) + SINK(z[0][2]) #$ flow="SOURCE, l:-7 -> z[0][2]" + SINK_F(z[0][3]) + +@expects(8) +def test_zip_tuple(): + t1 = (SOURCE, SOURCE) + t2 = (SOURCE, NONSOURCE) + t3 = (NONSOURCE, SOURCE) + t4 = (NONSOURCE, NONSOURCE) + + z = list(zip(t1,t2,t3,t4)) + + SINK(z[0][0]) #$ flow="SOURCE, l:-7 -> z[0][0]" + SINK(z[0][1]) #$ flow="SOURCE, l:-7 -> z[0][1]" + SINK_F(z[0][2]) + SINK_F(z[0][3]) + SINK(z[1][0]) #$ flow="SOURCE, l:-11 -> z[1][0]" + SINK_F(z[1][1]) #$ SPURIOUS: flow="SOURCE, l:-11 -> z[1][1]" + SINK(z[1][2]) #$ MISSING: flow="SOURCE, l:-11 -> z[1][2]" # Tuple contents are not tracked beyond the first two arguments for performance.
+ SINK_F(z[1][3]) + +@expects(4) +def test_zip_dict(): + d1 = {SOURCE: "v"} + d2 = {NONSOURCE: "v"} + d3 = {SOURCE: "v"} + d4 = {NONSOURCE: "v"} + + z = list(zip(d1,d2,d3,d4)) + + SINK(z[0][0]) #$ MISSING: flow="SOURCE, l:-7 -> z[0][0]" + SINK_F(z[0][1]) + SINK(z[0][2]) #$ MISSING: flow="SOURCE, l:-7 -> z[0][2]" + SINK_F(z[0][3]) \ No newline at end of file diff --git a/python/ql/test/library-tests/dataflow/variable-capture/test_library_calls.py b/python/ql/test/library-tests/dataflow/variable-capture/test_library_calls.py index 70b07f66557a..5db25fc83486 100644 --- a/python/ql/test/library-tests/dataflow/variable-capture/test_library_calls.py +++ b/python/ql/test/library-tests/dataflow/variable-capture/test_library_calls.py @@ -45,4 +45,4 @@ def set(x): for x in map(set, [1]): pass - SINK(captured["x"]) #$ MISSING: captured + SINK(captured["x"]) #$ captured