Skip to content

Commit 32f2614

Browse files
author
Stephan Brandauer
committed
add typecheckable mechanism to enforce minimal set of metadata
1 parent 6d29273 commit 32f2614

File tree

5 files changed

+85
-100
lines changed

5 files changed

+85
-100
lines changed

java/ql/src/Telemetry/AutomodelExtractCandidates.ql

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@
1212
* @tags internal automodel extract candidates
1313
*/
1414

15-
import AutomodelEndpointCharacteristics
15+
private import AutomodelFrameworkModeCharacteristics
1616

17-
from Endpoint endpoint, string message
17+
from
18+
Endpoint endpoint, string message, MetadataExtractor meta, string package, string type,
19+
boolean subtypes, string name, string signature, int input
1820
where
1921
not exists(CharacteristicsImpl::UninterestingToModelCharacteristic u |
2022
u.appliesToEndpoint(endpoint)
@@ -25,18 +27,20 @@ where
2527
// overlap between our detected sinks and the pre-existing modeling. We assume that, if a sink has already been
2628
// modeled in a MaD model, then it doesn't belong to any additional sink types, and we don't need to reexamine it.
2729
not CharacteristicsImpl::isSink(endpoint, _) and
30+
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
2831
// The message is the concatenation of all sink types for which this endpoint is known neither to be a sink nor to be
2932
// a non-sink, and we surface only endpoints that have at least one such sink type.
3033
message =
3134
strictconcat(AutomodelEndpointTypes::SinkType sinkType |
32-
not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and
33-
CharacteristicsImpl::isSinkCandidate(endpoint, sinkType)
34-
|
35-
sinkType + ", "
36-
) + "\n" +
37-
// Extract the needed metadata for this endpoint.
38-
any(string metadata | CharacteristicsImpl::hasMetadata(endpoint, metadata))
39-
select endpoint, message + "\nrelated locations: $@, $@.", //
35+
not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and
36+
CharacteristicsImpl::isSinkCandidate(endpoint, sinkType)
37+
|
38+
sinkType + ", "
39+
)
40+
select endpoint,
41+
message + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
4042
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Callable-JavaDoc"),
41-
"Callable-JavaDoc", //
42-
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"), "Class-JavaDoc" //
43+
"Callable-JavaDoc", CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"),
44+
"Class-JavaDoc", //
45+
package, "package", type, "type", subtypes.toString(), "subtypes", name, "name", signature,
46+
"signature", input.toString(), "input" //

java/ql/src/Telemetry/AutomodelExtractNegativeExamples.ql

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,21 @@
88
* @tags internal automodel extract examples negative
99
*/
1010

11-
import AutomodelEndpointCharacteristics
12-
import AutomodelEndpointTypes
11+
private import AutomodelFrameworkModeCharacteristics
12+
private import AutomodelEndpointTypes
1313

14-
from Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message
14+
from
15+
Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message,
16+
MetadataExtractor meta, string package, string type, boolean subtypes, string name,
17+
string signature, int input
1518
where
1619
characteristic.appliesToEndpoint(endpoint) and
1720
confidence >= SharedCharacteristics::highConfidence() and
1821
characteristic.hasImplications(any(NegativeSinkType negative), true, confidence) and
1922
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
2023
// certain about in the prompt.
2124
not erroneousEndpoints(endpoint, _, _, _, _, false) and
25+
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
2226
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
2327
// treated by the actual query as a sanitizer, since the final logic is something like
2428
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as negative examples in the prompt, because
@@ -29,11 +33,11 @@ where
2933
confidence2 >= SharedCharacteristics::maximalConfidence() and
3034
characteristic2.hasImplications(positiveType, true, confidence2)
3135
) and
32-
message =
33-
characteristic + "\n" +
34-
// Extract the needed metadata for this endpoint.
35-
any(string metadata | CharacteristicsImpl::hasMetadata(endpoint, metadata))
36-
select endpoint, message + "\nrelated locations: $@, $@.",
36+
message = characteristic
37+
select endpoint,
38+
message + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
3739
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Callable-JavaDoc"),
38-
"Callable-JavaDoc", //
39-
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"), "Class-JavaDoc" //
40+
"Callable-JavaDoc", CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"),
41+
"Class-JavaDoc", //
42+
package, "package", type, "type", subtypes.toString(), "subtypes", name, "name", signature,
43+
"signature", input.toString(), "input" //

java/ql/src/Telemetry/AutomodelExtractPositiveExamples.ql

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,25 +8,23 @@
88
* @tags internal automodel extract examples positive
99
*/
1010

11-
private import java
12-
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
13-
private import AutomodelEndpointCharacteristics
11+
private import AutomodelFrameworkModeCharacteristics
1412
private import AutomodelEndpointTypes
1513

16-
from Endpoint sink, SinkType sinkType, string message
14+
from
15+
Endpoint endpoint, SinkType sinkType, MetadataExtractor meta, string package, string type,
16+
boolean subtypes, string name, string signature, int input
1717
where
1818
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
1919
// certain about in the prompt.
20-
not erroneousEndpoints(sink, _, _, _, _, false) and
20+
not erroneousEndpoints(endpoint, _, _, _, _, false) and
21+
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
2122
// Extract positive examples of sinks belonging to the existing ATM query configurations.
22-
(
23-
CharacteristicsImpl::isKnownSink(sink, sinkType) and
24-
message =
25-
sinkType + "\n" +
26-
// Extract the needed metadata for this endpoint.
27-
any(string metadata | CharacteristicsImpl::hasMetadata(sink, metadata))
28-
)
29-
select sink, message + "\nrelated locations: $@, $@.",
30-
CharacteristicsImpl::getRelatedLocationOrCandidate(sink, "Callable-JavaDoc"),
31-
"Callable-JavaDoc", //
32-
CharacteristicsImpl::getRelatedLocationOrCandidate(sink, "Class-JavaDoc"), "Class-JavaDoc" //
23+
CharacteristicsImpl::isKnownSink(endpoint, sinkType)
24+
select endpoint,
25+
sinkType + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
26+
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Callable-JavaDoc"),
27+
"Callable-JavaDoc", CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"),
28+
"Class-JavaDoc", //
29+
package, "package", type, "type", subtypes.toString(), "subtypes", name, "name", signature,
30+
"signature", input.toString(), "input" //

java/ql/src/Telemetry/AutomodelFrameworkModeCharacteristics.qll

Lines changed: 41 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,22 @@ private import semmle.code.java.dataflow.internal.ModelExclusions as ModelExclus
1717
import AutomodelSharedCharacteristics as SharedCharacteristics
1818
import AutomodelEndpointTypes as AutomodelEndpointTypes
1919

20+
/** Gets the callable that encloses the parameter node `e`. */
Callable getCallable(DataFlow::ParameterNode e) { result = e.getEnclosingCallable() }
21+
22+
/**
23+
* A metadata extractor. Any Java extraction mode needs to implement exactly
24+
* one instance of this class.
25+
*/
26+
abstract class MetadataExtractor extends string {
27+
bindingset[this] // `this` is not restricted by the characteristic predicate below, so it must be bound by the caller/subclass
28+
MetadataExtractor() { any() } // places no constraint on the string value; concrete subclasses pin `this` to a fixed string
29+
/** Holds if endpoint `e` has the metadata `package`, `type`, `subtypes`, `name`, `signature` and `input`; each extraction mode defines how these are derived. */
30+
abstract predicate hasMetadata(
31+
DataFlow::ParameterNode e, string package, string type, boolean subtypes, string name,
32+
string signature, int input
33+
);
34+
}
35+
2036
module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
2137
class Endpoint = DataFlow::ParameterNode;
2238

@@ -87,26 +103,6 @@ module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
87103
exists(int paramIdx | e.isParameterOf(_, paramIdx) | input = "Argument[" + paramIdx + "]")
88104
}
89105

90-
predicate hasMetadata(Endpoint e, string metadata) {
91-
exists(
92-
string package, string type, boolean subtypes, string name, string signature, int input,
93-
boolean isPublic, boolean isFinal, boolean isStatic
94-
|
95-
hasMetadata(e, package, type, name, signature, input, isFinal, isStatic, isPublic) and
96-
(if isFinal = true or isStatic = true then subtypes = false else subtypes = true) and
97-
metadata =
98-
"{" //
99-
+ "'Package': '" + package //
100-
+ "', 'Type': '" + type //
101-
+ "', 'Subtypes': " + subtypes //
102-
+ ", 'Name': '" + name //
103-
+ ", 'ParamName': '" + e.toString() //
104-
+ "', 'Signature': '" + signature //
105-
+ "', 'Argument index': " + input //
106-
+ "'}" // TODO: Why are the curly braces added twice?
107-
)
108-
}
109-
110106
RelatedLocation getRelatedLocation(Endpoint e, string name) {
111107
name = "Callable-JavaDoc" and
112108
result = getCallable(e).(Documentable).getJavadoc()
@@ -116,8 +112,6 @@ module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
116112
}
117113
}
118114

119-
Callable getCallable(Endpoint e) { result = e.getEnclosingCallable() }
120-
121115
module CharacteristicsImpl = SharedCharacteristics::SharedCharacteristics<FrameworkCandidatesImpl>;
122116

123117
class EndpointCharacteristic = CharacteristicsImpl::EndpointCharacteristic;
@@ -129,32 +123,32 @@ class Endpoint = FrameworkCandidatesImpl::Endpoint;
129123
*/
130124

131125
/**
132-
* Holds if `n` has the given metadata.
133-
*
134-
* This is a helper function to extract and export needed information about each endpoint.
126+
* A MetadataExtractor that extracts metadata for framework mode.
135127
*/
136-
predicate hasMetadata(
137-
Endpoint n, string package, string type, string name, string signature, int input,
138-
boolean isFinal, boolean isStatic, boolean isPublic
139-
) {
140-
exists(Callable callable |
141-
n.asParameter() = callable.getParameter(input) and
142-
package = callable.getDeclaringType().getPackage().getName() and
143-
type = callable.getDeclaringType().getErasure().(RefType).nestedName() and
144-
(
145-
if callable.isStatic() or callable.getDeclaringType().isStatic()
146-
then isStatic = true
147-
else isStatic = false
148-
) and
149-
(
150-
if callable.isFinal() or callable.getDeclaringType().isFinal()
151-
then isFinal = true
152-
else isFinal = false
153-
) and
154-
name = callable.getSourceDeclaration().getName() and
155-
signature = ExternalFlow::paramsString(callable) and // TODO: Why are brackets being escaped (`\[\]` vs `[]`)?
156-
(if callable.isPublic() then isPublic = true else isPublic = false)
157-
)
128+
class FrameworkModeMetadataExtractor extends MetadataExtractor {
129+
FrameworkModeMetadataExtractor() { this = "FrameworkModeMetadataExtractor" } // the single string value that identifies this extractor
130+
/** Holds if `e` is parameter number `input` of a callable named `name` with parameter signature `signature`, declared in `type` of `package`; `subtypes` is `false` when the callable or its declaring type is static or final, and `true` otherwise. */
131+
override predicate hasMetadata(
132+
Endpoint e, string package, string type, boolean subtypes, string name, string signature,
133+
int input
134+
) {
135+
exists(Callable callable |
136+
e.asParameter() = callable.getParameter(input) and
137+
package = callable.getDeclaringType().getPackage().getName() and
138+
type = callable.getDeclaringType().getErasure().(RefType).nestedName() and
139+
(
140+
if
141+
callable.isStatic() or
142+
callable.getDeclaringType().isStatic() or
143+
callable.isFinal() or
144+
callable.getDeclaringType().isFinal()
145+
then subtypes = false // static/final: same polarity as the predicate this class replaces
146+
else subtypes = true
147+
) and
148+
name = callable.getSourceDeclaration().getName() and // callable name (as before), not the parameter's `toString()`
149+
signature = ExternalFlow::paramsString(callable)
150+
)
151+
}
158152
}
159153

160154
/*

java/ql/src/Telemetry/AutomodelSharedCharacteristics.qll

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -55,19 +55,6 @@ signature module CandidateSig {
5555
*/
5656
predicate isNeutral(Endpoint e);
5757

58-
/**
59-
* Holds if `e` has the given metadata.
60-
*
61-
* This is a helper function to extract and export needed information about each endpoint in the sink candidate query
62-
* as well as the queries that extract positive and negative examples for the prompt / training set. The metadata is
63-
* extracted as a string in the format of a Python dictionary, eg.:
64-
*
65-
* `{'Package': 'com.foo.util', 'Type': 'HelperClass', ... }`.
66-
*
67-
* The meta data will be passed on to the machine learning code by the extraction queries.
68-
*/
69-
predicate hasMetadata(Endpoint e, string metadata);
70-
7158
RelatedLocation getRelatedLocation(Endpoint e, string name);
7259
}
7360

@@ -107,8 +94,6 @@ module SharedCharacteristics<CandidateSig Candidate> {
10794
not exists(getAReasonSinkExcluded(candidateSink, sinkType))
10895
}
10996

110-
predicate hasMetadata = Candidate::hasMetadata/2;
111-
11297
/**
11398
* If it exists, gets a related location for a given endpoint or candidate.
11499
* If it doesn't exist, returns the candidate itself as a 'null' value.

0 commit comments

Comments
 (0)