Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/ql/lib/change-notes/2025-01-15-builtin-model.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
category: minorAnalysis
---
* Additional data flow models for the builtin functions `map`, `filter`, `zip`, and `enumerate` have been added.
118 changes: 118 additions & 0 deletions python/ql/lib/semmle/python/frameworks/Stdlib.qll
Original file line number Diff line number Diff line change
Expand Up @@ -4523,6 +4523,124 @@ module StdlibPrivate {
}
}

/**
 * A flow summary for the builtin `map`.
 *
 * Elements of the iterable at argument position `n + 1` flow to parameter `n` of
 * the callback at argument position 0, and the callback's return value flows to
 * the elements of the result.
 */
class MapSummary extends SummarizedCallable {
  MapSummary() { this = "builtins.map" }

  override DataFlow::CallCfgNode getACall() { result = API::builtin("map").getACall() }

  override DataFlow::ArgumentNode getACallback() {
    result = API::builtin("map").getAValueReachableFromSource()
  }

  override predicate propagatesFlow(string input, string output, boolean preservesValue) {
    preservesValue = true and
    (
      // Whatever the callback returns becomes an element of the result.
      input = "Argument[0].ReturnValue" and
      output = "ReturnValue.ListElement"
      or
      // Elements of the iterable at position `paramIdx + 1` reach callback parameter `paramIdx`.
      // `paramIdx` is bounded by the argument positions that occur in some call.
      exists(int paramIdx, string iterableArg |
        exists(any(Call call).getArg(paramIdx)) and
        iterableArg = "Argument[" + (paramIdx + 1).toString() + "]" and
        output = "Argument[0].Parameter[" + paramIdx.toString() + "]"
      |
        input = iterableArg + "." + ["ListElement", "SetElement"]
        or
        // We reduce generality slightly by not tracking tuple contents on list arguments beyond the first, for performance.
        // TODO: Once we have TupleElementAny, this generality can be increased.
        paramIdx = 0 and
        exists(DataFlow::TupleElementContent tupleContent |
          input = iterableArg + ".TupleElement[" + tupleContent.getIndex().toString() + "]"
        )
        // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
      )
    )
  }
}

/**
 * A flow summary for the builtin `filter`.
 *
 * Elements of the iterable at argument position 1 flow both to the first
 * parameter of the callback at argument position 0 and to the elements of the result.
 */
class FilterSummary extends SummarizedCallable {
  FilterSummary() { this = "builtins.filter" }

  override DataFlow::CallCfgNode getACall() { result = API::builtin("filter").getACall() }

  override DataFlow::ArgumentNode getACallback() {
    result = API::builtin("filter").getAValueReachableFromSource()
  }

  override predicate propagatesFlow(string input, string output, boolean preservesValue) {
    preservesValue = true and
    // Every element is handed to the predicate and may also survive into the result.
    output = ["Argument[0].Parameter[0]", "ReturnValue.ListElement"] and
    (
      input = "Argument[1]." + ["ListElement", "SetElement"]
      or
      exists(DataFlow::TupleElementContent tupleContent |
        input = "Argument[1].TupleElement[" + tupleContent.getIndex().toString() + "]"
      )
      // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
    )
  }
}

/**
 * A flow summary for the builtin `enumerate`.
 *
 * Elements of the iterable at argument position 0 flow to index 1 of the tuples
 * that make up the elements of the result.
 */
class EnumerateSummary extends SummarizedCallable {
  EnumerateSummary() { this = "builtins.enumerate" }

  override DataFlow::CallCfgNode getACall() { result = API::builtin("enumerate").getACall() }

  override DataFlow::ArgumentNode getACallback() {
    result = API::builtin("enumerate").getAValueReachableFromSource()
  }

  override predicate propagatesFlow(string input, string output, boolean preservesValue) {
    preservesValue = true and
    // Only tuple index 1 of each result element receives flow from the iterable.
    output = "ReturnValue.ListElement.TupleElement[1]" and
    (
      input = "Argument[0]." + ["ListElement", "SetElement"]
      or
      exists(DataFlow::TupleElementContent tupleContent |
        input = "Argument[0].TupleElement[" + tupleContent.getIndex().toString() + "]"
      )
      // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
    )
  }
}

/**
 * A flow summary for the builtin `zip`.
 *
 * Elements of the iterable at argument position `n` flow to index `n` of the
 * tuples that make up the elements of the result.
 */
class ZipSummary extends SummarizedCallable {
  ZipSummary() { this = "builtins.zip" }

  override DataFlow::CallCfgNode getACall() { result = API::builtin("zip").getACall() }

  override DataFlow::ArgumentNode getACallback() {
    result = API::builtin("zip").getAValueReachableFromSource()
  }

  override predicate propagatesFlow(string input, string output, boolean preservesValue) {
    // `argIdx` is bounded by the argument positions that occur in some call.
    exists(int argIdx, string iterableArg |
      exists(any(Call call).getArg(argIdx)) and
      iterableArg = "Argument[" + argIdx.toString() + "]" and
      output = "ReturnValue.ListElement.TupleElement[" + argIdx.toString() + "]" and
      preservesValue = true
    |
      input = iterableArg + "." + ["ListElement", "SetElement"]
      or
      // We reduce generality slightly by not tracking tuple contents on arguments beyond the first two, for performance.
      // TODO: Once we have TupleElementAny, this generality can be increased.
      argIdx in [0 .. 1] and
      exists(DataFlow::TupleElementContent tupleContent |
        input = iterableArg + ".TupleElement[" + tupleContent.getIndex().toString() + "]"
      )
      // TODO: Once we have DictKeyContent, we need to transform that into ListElementContent
    )
  }
}

// ---------------------------------------------------------------------------
// Flow summaries for container methods
// ---------------------------------------------------------------------------
Expand Down
243 changes: 243 additions & 0 deletions python/ql/test/library-tests/dataflow/coverage/test_builtins.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,3 +366,246 @@ def test_next_dict():
i = iter(d)
n = next(i)
SINK(n) #$ MISSING: flow="SOURCE, l:-3 -> n"

### map

@expects(4)
def test_map_list():
    """`map` over two lists: SOURCE in `l1` should reach `p1` and `rl[0][0]`; `l2` carries no flow."""
    l1 = [SOURCE]
    l2 = [NONSOURCE]

    def f(p1,p2):
        SINK(p1) #$ flow="SOURCE, l:-4 -> p1"
        SINK_F(p2)

        return p1,p2

    rl = list(map(f, l1, l2))
    SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]"
    SINK_F(rl[0][1])

@expects(4)
def test_map_set():
    """`map` over two sets: SOURCE in `s1` should reach `p1` and `rl[0][0]`; `s2` carries no flow."""
    s1 = {SOURCE}
    s2 = {NONSOURCE}

    def f(p1,p2):
        SINK(p1) #$ flow="SOURCE, l:-4 -> p1"
        SINK_F(p2)

        return p1,p2

    rl = list(map(f, s1, s2))
    SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]"
    SINK_F(rl[0][1])

@expects(4)
def test_map_tuple():
    """`map` over two tuples: SOURCE in `t1` should reach `p1` and `rl[0][0]`; `t2` carries no flow."""
    t1 = (SOURCE,)
    t2 = (NONSOURCE,)

    def f(p1,p2):
        SINK(p1) #$ flow="SOURCE, l:-4 -> p1"
        SINK_F(p2)

        return p1,p2

    rl = list(map(f, t1, t2))
    SINK(rl[0][0]) #$ flow="SOURCE, l:-10 -> rl[0][0]"
    SINK_F(rl[0][1])


@expects(4)
def test_map_dict():
    """`map` over two dicts (iterating keys): flow from the SOURCE key of `d1` is annotated MISSING."""
    d1 = {SOURCE: "v1"}
    d2 = {NONSOURCE: "v2"}

    def f(p1,p2):
        SINK(p1) #$ MISSING: flow="SOURCE, l:-4 -> p1"
        SINK_F(p2)

        return p1,p2

    rl = list(map(f, d1, d2))
    SINK(rl[0][0]) #$ MISSING: flow="SOURCE, l:-10 -> rl[0][0]"
    SINK_F(rl[0][1])

@expects(4)
def test_map_multi_list():
    """`map` over two lists that both hold SOURCE: both parameters and both result positions should see flow."""
    l1 = [SOURCE]
    l2 = [SOURCE]

    def f(p1,p2):
        SINK(p1) #$ flow="SOURCE, l:-4 -> p1"
        SINK(p2) #$ flow="SOURCE, l:-4 -> p2"
        return p1,p2

    rl = list(map(f, l1, l2))
    SINK(rl[0][0]) #$ flow="SOURCE, l:-9 -> rl[0][0]"
    SINK(rl[0][1]) #$ flow="SOURCE, l:-9 -> rl[0][1]"

@expects(4)
def test_map_multi_tuple():
    """`map` over two tuples that both hold SOURCE: flow from the second tuple is annotated MISSING."""
    l1 = (SOURCE,)
    l2 = (SOURCE,)

    def f(p1,p2):
        SINK(p1) #$ flow="SOURCE, l:-4 -> p1"
        SINK(p2) #$ MISSING: flow="SOURCE, l:-4 -> p2" # Tuples are not tracked beyond the first list argument for performance.
        return p1,p2

    rl = list(map(f, l1, l2))
    SINK(rl[0][0]) #$ flow="SOURCE, l:-9 -> rl[0][0]"
    SINK(rl[0][1]) #$ MISSING: flow="SOURCE, l:-9 -> rl[0][1]"

### filter

@expects(2)
def test_filter_list():
    """`filter` over a list: SOURCE should reach the predicate parameter `p` and the result element `rl[0]`."""
    l = [SOURCE]

    def f(p):
        SINK(p) #$ flow="SOURCE, l:-3 -> p"
        return True

    rl = list(filter(f,l))
    SINK(rl[0]) #$ flow="SOURCE, l:-7 -> rl[0]"

@expects(2)
def test_filter_set():
    """`filter` over a set: SOURCE should reach the predicate parameter `p` and the result element `rl[0]`."""
    s = {SOURCE}

    def f(p):
        SINK(p) #$ flow="SOURCE, l:-3 -> p"
        return True

    rl = list(filter(f,s))
    SINK(rl[0]) #$ flow="SOURCE, l:-7 -> rl[0]"

@expects(2)
def test_filter_tuple():
    """`filter` over a tuple: SOURCE should reach the predicate parameter `p` and the result element `rl[0]`."""
    t = (SOURCE,)

    def f(p):
        SINK(p) #$ flow="SOURCE, l:-3 -> p"
        return True

    rl = list(filter(f,t))
    SINK(rl[0]) #$ flow="SOURCE, l:-7 -> rl[0]"

@expects(2)
def test_filter_dict():
    """`filter` over a dict (iterating keys): flow from the SOURCE key is annotated MISSING at both sinks."""
    d = {SOURCE: "v"}

    def f(p):
        SINK(p) #$ MISSING: flow="SOURCE, l:-3 -> p"
        return True

    rl = list(filter(f,d))
    SINK(rl[0]) #$ MISSING: flow="SOURCE, l:-7 -> rl[0]"

### enumerate

@expects(1)
def test_enumerate_list():
    """`enumerate` over a list: SOURCE should reach index 1 of the first result tuple."""
    l = [SOURCE]

    e = list(enumerate(l))

    SINK(e[0][1]) #$ flow="SOURCE, l:-4 -> e[0][1]"

@expects(1)
def test_enumerate_set():
    """`enumerate` over a set: SOURCE should reach index 1 of the first result tuple."""
    s = {SOURCE}

    e = list(enumerate(s))

    SINK(e[0][1]) #$ flow="SOURCE, l:-4 -> e[0][1]"

@expects(1)
def test_enumerate_tuple():
    """`enumerate` over a tuple: SOURCE should reach index 1 of the first result tuple."""
    t = (SOURCE,)

    e = list(enumerate(t))

    SINK(e[0][1]) #$ flow="SOURCE, l:-4 -> e[0][1]"

@expects(2)
def test_enumerate_list_for():
    """`enumerate` consumed directly by `for` loops, once with tuple unpacking and once by indexing the tuple."""
    l = [SOURCE]

    for i, x in enumerate(l):
        SINK(x) #$ flow="SOURCE, l:-3 -> x"

    for t in enumerate(l):
        SINK(t[1]) #$ flow="SOURCE, l:-6 -> t[1]"

@expects(1)
def test_enumerate_dict():
    """`enumerate` over a dict (iterating keys): flow from the SOURCE key is annotated MISSING."""
    d = {SOURCE:"v"}

    e = list(enumerate(d))

    SINK(e[0][1]) # $ MISSING: flow="SOURCE, l:-4 -> e[0][1]"

### zip

@expects(8)
def test_zip_list():
    """`zip` over four lists mixing SOURCE and NONSOURCE elements.

    Sinks annotated SPURIOUS are positions holding NONSOURCE for which flow is
    nevertheless reported.
    """
    l1 = [SOURCE, SOURCE]
    l2 = [SOURCE, NONSOURCE]
    l3 = [NONSOURCE, SOURCE]
    l4 = [NONSOURCE, NONSOURCE]

    z = list(zip(l1,l2,l3,l4))

    SINK(z[0][0]) #$ flow="SOURCE, l:-7 -> z[0][0]"
    SINK(z[0][1]) #$ flow="SOURCE, l:-7 -> z[0][1]"
    SINK_F(z[0][2]) #$ SPURIOUS: flow="SOURCE, l:-7 -> z[0][2]"
    SINK_F(z[0][3])
    SINK(z[1][0]) #$ flow="SOURCE, l:-11 -> z[1][0]"
    SINK_F(z[1][1]) #$ SPURIOUS: flow="SOURCE, l:-11 -> z[1][1]"
    SINK(z[1][2]) #$ flow="SOURCE, l:-11 -> z[1][2]"
    SINK_F(z[1][3])

@expects(4)
def test_zip_set():
    """`zip` over four single-element sets: SOURCE should reach result positions 0 and 2 only."""
    s1 = {SOURCE}
    s2 = {NONSOURCE}
    s3 = {SOURCE}
    s4 = {NONSOURCE}

    z = list(zip(s1,s2,s3,s4))

    SINK(z[0][0]) #$ flow="SOURCE, l:-7 -> z[0][0]"
    SINK_F(z[0][1])
    SINK(z[0][2]) #$ flow="SOURCE, l:-7 -> z[0][2]"
    SINK_F(z[0][3])

@expects(8)
def test_zip_tuple():
    """`zip` over four tuples mixing SOURCE and NONSOURCE elements.

    One sink is annotated SPURIOUS and one MISSING; the MISSING case is a tuple
    passed beyond the first two arguments.
    """
    t1 = (SOURCE, SOURCE)
    t2 = (SOURCE, NONSOURCE)
    t3 = (NONSOURCE, SOURCE)
    t4 = (NONSOURCE, NONSOURCE)

    z = list(zip(t1,t2,t3,t4))

    SINK(z[0][0]) #$ flow="SOURCE, l:-7 -> z[0][0]"
    SINK(z[0][1]) #$ flow="SOURCE, l:-7 -> z[0][1]"
    SINK_F(z[0][2])
    SINK_F(z[0][3])
    SINK(z[1][0]) #$ flow="SOURCE, l:-11 -> z[1][0]"
    SINK_F(z[1][1]) #$ SPURIOUS: flow="SOURCE, l:-11 -> z[1][1]"
    SINK(z[1][2]) #$ MISSING: flow="SOURCE, l:-11 -> z[1][2]" # Tuple contents are not tracked beyond the first two arguments for performance.
    SINK_F(z[1][3])

@expects(4)
def test_zip_dict():
    """`zip` over four dicts (iterating keys): flow from the SOURCE keys is annotated MISSING."""
    d1 = {SOURCE: "v"}
    d2 = {NONSOURCE: "v"}
    d3 = {SOURCE: "v"}
    d4 = {NONSOURCE: "v"}

    z = list(zip(d1,d2,d3,d4))

    SINK(z[0][0]) #$ MISSING: flow="SOURCE, l:-7 -> z[0][0]"
    SINK_F(z[0][1])
    SINK(z[0][2]) #$ MISSING: flow="SOURCE, l:-7 -> z[0][2]"
    SINK_F(z[0][3])
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ def set(x):
for x in map(set, [1]):
pass

SINK(captured["x"]) #$ MISSING: captured
SINK(captured["x"]) #$ captured