Skip to content

Commit 52a8230

Browse files
author
Stephan Brandauer
committed
restructure shared characteristics module; add framework support for sanitizers
1 parent ffe7c62 commit 52a8230

File tree

4 files changed

+69
-44
lines changed

4 files changed

+69
-44
lines changed

java/ql/src/Telemetry/AutomodelEndpointCharacteristics.qll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ module CandidatesImpl implements SharedCharacteristics::CandidateSig {
2525
t instanceof AutomodelEndpointTypes::NegativeSinkType
2626
}
2727

28+
// Sanitizers are currently not modeled in MaD. TODO: check whether this has a large negative impact.
29+
predicate isSanitizer(Endpoint e, EndpointType t) { none() }
30+
2831
string getLocationString(Endpoint e) { result = e.getLocation().toString() }
2932

3033
predicate isKnownLabel(string label, string humanReadableLabel, EndpointType type) {

java/ql/src/Telemetry/AutomodelExtractNegativeExamples.ql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* @kind problem
66
* @severity info
77
* @id java/ml-powered/non-sink
8-
* @tags automodel extract negative-examples
8+
* @tags automodel extract examples negative
99
*/
1010

1111
import AutomodelEndpointCharacteristics

java/ql/src/Telemetry/AutomodelExtractPositiveExamples.ql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* @kind problem
66
* @severity info
77
* @id java/ml-powered/known-sink
8-
* @tags automodel extract positive-examples
8+
* @tags automodel extract examples positive
99
*/
1010

1111
private import java

java/ql/src/Telemetry/AutomodelSharedCharacteristics.qll

Lines changed: 64 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,19 @@ float highConfidence() { result = 0.9 }
44

55
float mediumConfidence() { result = 0.6 }
66

7+
/**
8+
* A specification of how to instantiate the shared characteristics for a given candidate class.
9+
*
10+
* The `CandidateSig` implementation specifies a type to use for Endpoints (e.g., `ParameterNode`), as well as a type
11+
* to label endpoint classes (the `EndpointType`). One of the endpoint classes needs to be a 'negative' class, meaning
12+
* "not any of the other known endpoint types".
13+
*/
714
signature module CandidateSig {
815
class Endpoint;
916

1017
class EndpointType;
1118

19+
/** The string representing the file+range of the endpoint. */
1220
string getLocationString(Endpoint e);
1321

1422
/**
@@ -22,6 +30,11 @@ signature module CandidateSig {
2230
*/
2331
predicate isNegative(EndpointType t);
2432

33+
/**
34+
* Should hold for any endpoint that is a flow sanitizer.
35+
*/
36+
predicate isSanitizer(Endpoint e, EndpointType t);
37+
2538
/**
2639
* Should hold for any endpoint that is a sink of the given (known or unknown) label.
2740
*/
@@ -37,11 +50,23 @@ signature module CandidateSig {
3750
*
3851
* This is a helper function to extract and export needed information about each endpoint in the sink candidate query
3952
* as well as the queries that extract positive and negative examples for the prompt / training set. The metadata is
40-
* extracted as a string in the format of a Python dictionary.
53+
* extracted as a string in the format of a Python dictionary, e.g.:
54+
*
55+
* `{'Package': 'com.foo.util', 'Type': 'HelperClass', ... }`.
56+
*
57+
* The metadata will be passed on to the machine learning code by the extraction queries.
4158
*/
4259
predicate hasMetadata(Endpoint e, string metadata);
4360
}
4461

62+
/**
63+
* A set of shared characteristics for a given candidate class.
64+
*
65+
* This module is language-agnostic, although the `CandidateSig` module will be language-specific.
66+
*
67+
* The language-specific implementation can also further extend the behaviour of this module by adding additional
68+
* implementations of endpoint characteristics exported by this module.
69+
*/
4570
module SharedCharacteristics<CandidateSig Candidate> {
4671
predicate isNegative(Candidate::EndpointType e) { Candidate::isNegative(e) }
4772

@@ -159,20 +184,6 @@ module SharedCharacteristics<CandidateSig Candidate> {
159184
}
160185
}
161186

162-
/**
163-
* Endpoints identified as sinks by the MaD modeling are sinks with maximal confidence.
164-
*/
165-
private class KnownSinkCharacteristic extends SinkCharacteristic {
166-
string madLabel;
167-
Candidate::EndpointType endpointType;
168-
169-
KnownSinkCharacteristic() { Candidate::isKnownLabel(madLabel, this, endpointType) }
170-
171-
override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isSink(e, madLabel) }
172-
173-
override Candidate::EndpointType getSinkType() { result = endpointType }
174-
}
175-
176187
/**
177188
* A high-confidence characteristic that indicates that an endpoint is not a sink of any type. These endpoints can be
178189
* used as negative samples for training or for a few-shot prompt.
@@ -190,33 +201,6 @@ module SharedCharacteristics<CandidateSig Candidate> {
190201
}
191202
}
192203

193-
/**
194-
* A negative characteristic that indicates that an endpoint is not part of the source code for the project being
195-
* analyzed.
196-
*
197-
* WARNING: These endpoints should not be used as negative samples for training, because they are not necessarily
198-
* non-sinks. They are merely not interesting sinks to run through the ML model.
199-
*/
200-
private class IsExternalCharacteristic extends LikelyNotASinkCharacteristic {
201-
IsExternalCharacteristic() { this = "external" }
202-
203-
override predicate appliesToEndpoint(Candidate::Endpoint e) {
204-
not exists(Candidate::getLocationString(e))
205-
}
206-
}
207-
208-
/**
209-
* A negative characteristic that indicates that an endpoint was manually modeled as a neutral model.
210-
*
211-
* TODO: It may be necessary to turn this into a LikelyNotASinkCharacteristic, pending answers to the definition of a
212-
* neutral model (https://github.com/github/codeql-java-team/issues/254#issuecomment-1435309148).
213-
*/
214-
private class NeutralModelCharacteristic extends NotASinkCharacteristic {
215-
NeutralModelCharacteristic() { this = "known non-sink" }
216-
217-
override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isNeutral(e) }
218-
}
219-
220204
/**
221205
* A medium-confidence characteristic that indicates that an endpoint is unlikely to be a sink of any type. These
222206
* endpoints can be excluded from scoring at inference time, both to save time and to avoid false positives. They should
@@ -256,4 +240,42 @@ module SharedCharacteristics<CandidateSig Candidate> {
256240
confidence = mediumConfidence()
257241
}
258242
}
243+
244+
/**
245+
* Contains default implementations that are derived solely from the `CandidateSig` implementation.
246+
*/
247+
private module DefaultCharacteristicImplementations {
248+
/**
249+
* Endpoints identified as sinks by the `CandidateSig` implementation are sinks with maximal confidence.
250+
*/
251+
private class KnownSinkCharacteristic extends SinkCharacteristic {
252+
string madLabel;
253+
Candidate::EndpointType endpointType;
254+
255+
KnownSinkCharacteristic() { Candidate::isKnownLabel(madLabel, this, endpointType) }
256+
257+
override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isSink(e, madLabel) }
258+
259+
override Candidate::EndpointType getSinkType() { result = endpointType }
260+
}
261+
262+
/**
263+
* A negative characteristic that indicates that an endpoint was manually modeled as a neutral model.
264+
*/
265+
private class NeutralModelCharacteristic extends NotASinkCharacteristic {
266+
NeutralModelCharacteristic() { this = "known non-sink" }
267+
268+
override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isNeutral(e) }
269+
}
270+
271+
/**
272+
* A negative characteristic that indicates that an endpoint is a flow sanitizer (as reported by
273+
* `Candidate::isSanitizer`) and is therefore not a sink.
274+
*/
275+
private class IsSanitizerCharacteristic extends NotASinkCharacteristic {
276+
IsSanitizerCharacteristic() { this = "known sanitizer" } // label names the sanitizer characteristic (was the stale "external" label)
277+
278+
override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isSanitizer(e, _) }
279+
}
280+
}
259281
}

0 commit comments

Comments
 (0)