Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions include/phasar/PhasarLLVM/Utils/LLVMIRToSrc.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,19 @@ void to_json(nlohmann::json &J, const SourceCodeInfo &Info);
[[nodiscard]] std::optional<DebugLocation>
getDebugLocation(const llvm::Value *V);

/// Call \p Callback(DILocalVariable *, Value *) for every dbg.declare record
/// attached to \p I. Handles both the legacy intrinsic form and the
/// DbgVariableRecord form introduced in LLVM 19.
void forEachDbgDeclare(
const llvm::Instruction &I,
llvm::function_ref<void(const llvm::DILocalVariable *, const llvm::Value *)>
Callback);

/// Strip DW_TAG_typedef chains from \p Ty and return the underlying
/// DICompositeType if it is a DW_TAG_structure_type; return null otherwise.
[[nodiscard]] const llvm::DICompositeType *
stripTypedefsToStruct(const llvm::DIType *Ty);

} // namespace psr

#endif
133 changes: 66 additions & 67 deletions lib/PhasarLLVM/TaintConfig/LLVMTaintConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,17 @@
#include "phasar/PhasarLLVM/TaintConfig/TaintConfigBase.h"
#include "phasar/PhasarLLVM/TaintConfig/TaintConfigData.h"
#include "phasar/PhasarLLVM/Utils/Annotation.h"
#include "phasar/PhasarLLVM/Utils/LLVMIRToSrc.h"
#include "phasar/PhasarLLVM/Utils/LLVMShorthands.h"
#include "phasar/Utils/EnumFlags.h"
#include "phasar/Utils/Logger.h"

#include "llvm/ADT/SmallVector.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

#include <string>
Expand Down Expand Up @@ -127,64 +128,61 @@ void LLVMTaintConfig::addAllFunctions(const LLVMProjectIRDB &IRDB,

LLVMTaintConfig::LLVMTaintConfig(const psr::LLVMProjectIRDB &Code,
const TaintConfigData &Config) {
// handle functions
addAllFunctions(Code, Config);

// handle variables
// scope can be a function name or a struct.
std::unordered_map<const llvm::Type *, const std::string> StructConfigMap;
if (Config.Variables.empty()) {
return;
}

// read all struct types from config
llvm::DebugInfoFinder DIF;
const auto *M = Code.getModule();
// Map LLVM struct type → configured field name, built by correlating
// debug-declare records with allocas (avoids IR type naming assumptions).
std::unordered_map<const llvm::Type *, std::string> StructConfigMap;

DIF.processModule(*M);
for (const auto &VarDesc : Config.Variables) {
for (const auto &Ty : DIF.types()) {
if (Ty->getTag() == llvm::dwarf::DW_TAG_structure_type &&
Ty->getName() == VarDesc.Scope) {
// TODO: Below loop looks wrong! Why on earth are we looping over the
// SAME set of types every time?
for (const auto &LlvmStructTy : M->getIdentifiedStructTypes()) {
StructConfigMap.insert(
std::pair<const llvm::Type *, const std::string>(LlvmStructTy,
VarDesc.Name));
}
{
// Pre-build scope → field name for O(1) lookup during the alloca scan.
llvm::StringMap<std::string> StructScopeIndex;
for (const auto &VarDesc : Config.Variables) {
StructScopeIndex.try_emplace(VarDesc.Scope, VarDesc.Name);
}

for (const auto *Fun : Code.getAllFunctions()) {
for (const auto &I : llvm::instructions(Fun)) {
forEachDbgDeclare(I, [&](const llvm::DILocalVariable *LocalVar,
const llvm::Value *Addr) {
const auto *AllocaI = llvm::dyn_cast<llvm::AllocaInst>(Addr);
if (!AllocaI) {
return;
}
const auto *StructTy =
llvm::dyn_cast<llvm::StructType>(AllocaI->getAllocatedType());
if (!StructTy) {
return;
}
const auto *CT = stripTypedefsToStruct(LocalVar->getType());
if (!CT) {
return;
}
if (auto It = StructScopeIndex.find(CT->getName());
It != StructScopeIndex.end()) {
StructConfigMap.try_emplace(StructTy, It->second);
}
});
}
}
}

// add corresponding Allocas or getElementPtr instructions to the taint
// category

const auto AddVariable = [this](const VariableData &VarDesc,
const llvm::Instruction &I) -> bool {
#if LLVM_VERSION_MAJOR > 18
for (const llvm::DbgVariableRecord &DbR :
llvm::filterDbgVars(I.getDbgRecordRange())) {
const auto *Var = DbR.getVariable();
assert(Var != nullptr);
if (Var->getName() == VarDesc.Name && Var->getLine() == VarDesc.Line) {
addTaintCategory(DbR.getAddress(), VarDesc.Cat);
return true;
}
}
#endif

if (const auto *DbgDeclare = llvm::dyn_cast<llvm::DbgDeclareInst>(&I)) {
const llvm::DILocalVariable *LocalVar = DbgDeclare->getVariable();
// matching line number with for Allocas
if (LocalVar->getName() == VarDesc.Name &&
LocalVar->getLine() == VarDesc.Line) {
addTaintCategory(DbgDeclare->getAddress(), VarDesc.Cat);
return true;
const llvm::Instruction &I) {
forEachDbgDeclare(I, [&](const llvm::DILocalVariable *Var,
const llvm::Value *Addr) {
if (Var->getLine() == VarDesc.Line && Var->getName() == VarDesc.Name) {
addTaintCategory(Addr, VarDesc.Cat);
}
}
return false;
});
};

for (const auto &VarDesc : Config.Variables) {
for (const auto &Fun : Code.getAllFunctions()) {
for (const auto *Fun : Code.getAllFunctions()) {
const bool MatchingFunScope = [&] {
if (VarDesc.Scope == Fun->getName()) {
return true;
Expand All @@ -194,21 +192,22 @@ LLVMTaintConfig::LLVMTaintConfig(const psr::LLVMProjectIRDB &Code,
}();

for (const auto &I : llvm::instructions(Fun)) {
if (MatchingFunScope && AddVariable(VarDesc, I)) {
continue;
if (MatchingFunScope) {
AddVariable(VarDesc, I);
}

if (!StructConfigMap.empty()) {
// Ignorning line numbers for getElementPtr instructions
// Ignoring line numbers for getElementPtr instructions.
// starts_with covers the edge case where the same name appears as
// both a local variable and a struct member
// (JsonConfig/fun_member_02)
if (const auto *Gep = llvm::dyn_cast<llvm::GetElementPtrInst>(&I)) {
const auto *StType =
llvm::dyn_cast<llvm::StructType>(Gep->getSourceElementType());
if (StType && StructConfigMap.contains(StType)) {
auto VarName = StructConfigMap.at(StType);
// using substr to cover the edge case in which same variable
// name is present as a local variable and also as a struct
// member variable. (Ex. JsonConfig/fun_member_02.cpp)
if (Gep->getName().starts_with(VarName)) {
if (StType) {
if (auto It = StructConfigMap.find(StType);
It != StructConfigMap.end() &&
Gep->getName().starts_with(It->second)) {
addTaintCategory(Gep, VarDesc.Cat);
}
}
Expand All @@ -224,10 +223,10 @@ LLVMTaintConfig::LLVMTaintConfig(const psr::LLVMProjectIRDB &AnnotatedCode) {
llvm::SmallVector<const llvm::Function *, 1> VarAnnotations{};
llvm::SmallVector<const llvm::Function *, 1> PtrAnnotations{};
for (const auto *F : AnnotatedCode.getAllFunctions()) {
if (F->getName().starts_with("llvm.var.annotation")) {
if (F->getIntrinsicID() == llvm::Intrinsic::var_annotation) {
VarAnnotations.push_back(F);
}
if (F->getName().starts_with("llvm.ptr.annotation")) {
if (F->getIntrinsicID() == llvm::Intrinsic::ptr_annotation) {
PtrAnnotations.push_back(F);
}
}
Expand Down Expand Up @@ -343,13 +342,13 @@ void LLVMTaintConfig::addTaintCategory(const llvm::Value *Val,
//

bool LLVMTaintConfig::isSourceImpl(const llvm::Value *V) const {
return SourceValues.count(V);
return SourceValues.contains(V);
}
bool LLVMTaintConfig::isSinkImpl(const llvm::Value *V) const {
return SinkValues.count(V);
return SinkValues.contains(V);
}
bool LLVMTaintConfig::isSanitizerImpl(const llvm::Value *V) const {
return SanitizerValues.count(V);
return SanitizerValues.contains(V);
}

void LLVMTaintConfig::forAllGeneratedValuesAtImpl(
Expand All @@ -366,7 +365,7 @@ void LLVMTaintConfig::forAllGeneratedValuesAtImpl(

if (Callee) {
for (const auto &Arg : Callee->args()) {
if (SourceValues.count(&Arg)) {
if (SourceValues.contains(&Arg)) {
Handler(Inst->getOperand(Arg.getArgNo()));
}
}
Expand All @@ -376,13 +375,13 @@ void LLVMTaintConfig::forAllGeneratedValuesAtImpl(
/// If any function is called with a variable that was defined as source, we
/// don't want to re-generate the value.
for (const auto *Op : Inst->operand_values()) {
if (SourceValues.count(Op)) {
if (SourceValues.contains(Op)) {
Handler(Op);
}
}
}

if (SourceValues.count(Inst)) {
if (SourceValues.contains(Inst)) {
Handler(Inst);
}
}
Expand All @@ -401,7 +400,7 @@ void LLVMTaintConfig::forAllLeakCandidatesAtImpl(

if (Callee) {
for (const auto &Arg : Callee->args()) {
if (SinkValues.count(&Arg)) {
if (SinkValues.contains(&Arg)) {
Handler(Inst->getOperand(Arg.getArgNo()));
}
}
Expand All @@ -427,7 +426,7 @@ void LLVMTaintConfig::forAllSanitizedValuesAtImpl(

if (Callee) {
for (const auto &Arg : Callee->args()) {
if (SanitizerValues.count(&Arg)) {
if (SanitizerValues.contains(&Arg)) {
Handler(Inst->getOperand(Arg.getArgNo()));
}
}
Expand All @@ -448,7 +447,7 @@ bool LLVMTaintConfig::generatesValuesAtImpl(
return true;
}

return SourceValues.count(Inst) ||
return SourceValues.contains(Inst) ||
llvm::any_of(Inst->operand_values(), [this](const auto *Op) {
return SourceValues.count(Op);
});
Expand Down
34 changes: 34 additions & 0 deletions lib/PhasarLLVM/Utils/LLVMIRToSrc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Demangle/Demangle.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
Expand All @@ -24,6 +25,9 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#if LLVM_VERSION_MAJOR > 18
#include "llvm/IR/DebugProgramInstruction.h"
#endif
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
Expand Down Expand Up @@ -549,3 +553,33 @@ std::optional<DebugLocation> psr::getDebugLocation(const llvm::Value *V) {

return std::nullopt;
}

void psr::forEachDbgDeclare(
const llvm::Instruction &I,
llvm::function_ref<void(const llvm::DILocalVariable *, const llvm::Value *)>
Callback) {
#if LLVM_VERSION_MAJOR > 18
for (const llvm::DbgVariableRecord &DbR :
llvm::filterDbgVars(I.getDbgRecordRange())) {
if (DbR.isDbgDeclare()) {
Callback(DbR.getVariable(), DbR.getAddress());
}
}
#endif
if (const auto *D = llvm::dyn_cast<llvm::DbgDeclareInst>(&I)) {
Callback(D->getVariable(), D->getAddress());
}
}

const llvm::DICompositeType *
psr::stripTypedefsToStruct(const llvm::DIType *Ty) {
while (const auto *D = llvm::dyn_cast_or_null<llvm::DIDerivedType>(Ty)) {
if (D->getTag() != llvm::dwarf::DW_TAG_typedef) {
return nullptr;
}
Ty = D->getBaseType();
}
const auto *CT = llvm::dyn_cast_or_null<llvm::DICompositeType>(Ty);
return (CT && CT->getTag() == llvm::dwarf::DW_TAG_structure_type) ? CT
: nullptr;
}
2 changes: 2 additions & 0 deletions test/llvm_test_code/TaintConfig/JsonConfig/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set(JsonTaintConfigTestSources
basic_03.c
basic_04.c
data_member_01.cpp
data_member_02.cpp
fun_member_01.cpp
fun_member_02.cpp
name_mangling_01.cpp
Expand All @@ -25,6 +26,7 @@ set(JsonTaintConfigFiles
basic_03_config.json
basic_04_config.json
data_member_01_config.json
data_member_02_config.json
fun_member_01_config.json
fun_member_02_config.json
name_mangling_01_config.json
Expand Down
15 changes: 15 additions & 0 deletions test/llvm_test_code/TaintConfig/JsonConfig/data_member_02.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

struct Y {
int A = 99;
};

struct X {
int A = 13;
int B = 0;
};

int main() {
X V;
Y W;
return V.A + V.B + W.A;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"name": "taint-config-test",
"version": 1.0,
"variables": [
{
"file": "data_member_02.cpp",
"scope": "X",
"name": "A",
"line": 7,
"cat": "source"
}
]
}
23 changes: 23 additions & 0 deletions unittests/PhasarLLVM/TaintConfig/TaintConfigTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,29 @@ TEST_F(TaintConfigTest, DataMember_01_Json) {
ASSERT_TRUE(TConfig.isSource(I));
}

TEST_F(TaintConfigTest, DataMember_02_Json) {
// Regression test for
// https://github.com/secure-software-engineering/phasar/issues/835 Two
// structs (X and Y) both have a field named "A". Only X::A is configured as a
// source. Verify X::A GEPs are sources and Y::A GEPs are not.
const std::string File = "data_member_02_cpp_dbg.ll";
const std::string Config = "data_member_02_config.json";
auto JsonConfig =
psr::parseTaintConfig(PathToJsonTaintConfigTestCode + Config);
psr::LLVMProjectIRDB IR({PathToJsonTaintConfigTestCode + File});
psr::LLVMTaintConfig TConfig(IR, JsonConfig);

// GEP for X::A in X's default constructor — should be source
const llvm::Value *XFieldA =
testingLocInIR(LineColFun{7, 7, "_ZN1XC2Ev"}, IR);
ASSERT_TRUE(TConfig.isSource(XFieldA));

// GEP for Y::A in Y's default constructor — must NOT be source
const llvm::Value *YFieldA =
testingLocInIR(LineColFun{3, 7, "_ZN1YC2Ev"}, IR);
ASSERT_FALSE(TConfig.isSource(YFieldA));
}

TEST_F(TaintConfigTest, FunMember_01_Json) {
const std::string File = "fun_member_01_cpp_dbg.ll";
const std::string Config = "fun_member_01_config.json";
Expand Down
Loading