@@ -14,7 +14,10 @@ private import AutomodelEndpointTypes
1414private import AutomodelSharedUtil
1515
1616/**
17- * Gets a sample of endpoints for which the given characteristic applies.
17+ * Gets a sample of at most `limit` endpoints for which the given characteristic applies.
18+ *
19+ * The main purpose of this helper predicate is to avoid selecting too many samples, as this may
20+ * cause the SARIF file to exceed the maximum size limit.
1821 */
1922bindingset [ limit]
2023Endpoint getSampleForCharacteristic ( EndpointCharacteristic c , int limit ) {
@@ -28,7 +31,11 @@ Endpoint getSampleForCharacteristic(EndpointCharacteristic c, int limit) {
2831 loc .getFile ( ) .getAbsolutePath ( ) , loc .getStartLine ( ) , loc .getStartColumn ( ) ,
2932 loc .getEndLine ( ) , loc .getEndColumn ( )
3033 ) and
31- // we order the endpoints by location, but (to avoid bias) we select the indices semi-randomly
34+ // To avoid selecting samples that are too close together (as the ranking above goes by file
35+ // path first), we select `limit` evenly spaced samples from the ranked list of endpoints. By
36+ // default this would always include the first sample, so we add a randomly chosen prime offset
37+ // to each sample index, and reduce the result modulo the number of endpoints.
38+ // Finally, we add 1 to the result, as ranking results in a 1-indexed relation.
3239 n = 1 + ( ( [ 0 .. limit - 1 ] * ( num_endpoints / limit ) .floor ( ) + 46337 ) % num_endpoints )
3340 )
3441}
0 commit comments