diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 00000000000..9e54da52323 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index b6cb6f7c796..b93f0dcdd2c 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,6 @@ OpenMS.config OpenMS.creator OpenMS.files OpenMS.includes +.DS_Store +src/.DS_Store +src/openms/.DS_Store diff --git a/cmake/knime_package_support.cmake b/cmake/knime_package_support.cmake index bddca635496..82aadb4643d 100644 --- a/cmake/knime_package_support.cmake +++ b/cmake/knime_package_support.cmake @@ -118,6 +118,8 @@ add_custom_target( COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=MSGFPlusAdapter -DPARAM=executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake # LuciPhorAdapter COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=LuciphorAdapter -DPARAM=executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake + # PercolatorAdapter + COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=PercolatorAdapter -DPARAM=percolator_executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake # FidoAdapter COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=FidoAdapter -DPARAM=fido_executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=FidoAdapter -DPARAM=fidocp_executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake @@ -265,6 +267,8 @@ elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/Fido) message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (Fido is missing). ${FOLDER_STRUCTURE_MESSAGE}") elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/LuciPHOr2) message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (LuciPHOr2 is missing). ${FOLDER_STRUCTURE_MESSAGE}") +elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/Percolator) + message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (Percolator is missing). Please check use the one from the SVN.") elseif(NOT APPLE AND NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/MyriMatch) message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (MyriMatch is missing). ${FOLDER_STRUCTURE_MESSAGE}") endif() diff --git a/doc/doxygen/public/TOPP.doxygen b/doc/doxygen/public/TOPP.doxygen index aa3b9b1380c..5c6429bbe8e 100755 --- a/doc/doxygen/public/TOPP.doxygen +++ b/doc/doxygen/public/TOPP.doxygen @@ -139,6 +139,7 @@ - @subpage TOPP_PeptideIndexer - Refreshes the protein references for all peptide hits. - @subpage TOPP_PhosphoScoring - Scores potential phosphorylation sites in order to localize the most probable sites. - @subpage TOPP_ProteinInference - Infer proteins from a list of (high-confidence) peptides. + - @subpage TOPP_PercolatorAdapter - Applies the percolator algorithm to protein/peptide identifications. Targeted Experiments - @subpage TOPP_InclusionExclusionListCreator - Creates inclusion and/or exclusion lists for LC-MS/MS experiments. diff --git a/doc/doxygen/public/UTILS.doxygen b/doc/doxygen/public/UTILS.doxygen index 1b1c10995f9..7adfc65bcc6 100644 --- a/doc/doxygen/public/UTILS.doxygen +++ b/doc/doxygen/public/UTILS.doxygen @@ -78,6 +78,8 @@ - @subpage UTILS_RNPxl - Tool for RNP cross linking experiment analysis. - @subpage UTILS_SequenceCoverageCalculator - Prints information about idXML files. - @subpage UTILS_SpecLibCreator - Creates an MSP-formatted spectral library. + - @subpage UTILS_PSMFeatureExtractor - Creates search engine specific features for PercolatorAdapter input. + Quantitation - @subpage UTILS_ERPairFinder - Evaluate pair ratios on enhanced resolution (zoom) scans. diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 00000000000..4bd2efc1187 Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/openms/.DS_Store b/src/openms/.DS_Store new file mode 100644 index 00000000000..f11a0d0eb96 Binary files /dev/null and b/src/openms/.DS_Store differ diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/PercolatorFeatureSetHelper.h b/src/openms/include/OpenMS/ANALYSIS/ID/PercolatorFeatureSetHelper.h new file mode 100644 index 00000000000..f680b252e11 --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/ID/PercolatorFeatureSetHelper.h @@ -0,0 +1,203 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2015. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Mathias Walzer $ +// $Authors: Mathias Walzer, Matthew The $ +// -------------------------------------------------------------------------- + +#ifndef OPENMS_ANALYSIS_ID_TOPPERC_H +#define OPENMS_ANALYSIS_ID_TOPPERC_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace OpenMS +{ + /** + @brief Percolator feature set and integration helper + + This class contains functions to handle (compute, aggregate, integrate) + Percolator features. This includes the calculation or extraction of + Percolator features for the specific search engine usage, preparation for + PercolatorApater usage and result reintegration and in the case of + multiple search engine incorporation of different features. + */ + + class OPENMS_DLLAPI PercolatorFeatureSetHelper + { + + public: + /** + * @brief concatMULTISEPeptideIds + * @param all_peptide_ids PeptideIdentification vector to append to + * @param new_peptide_ids PeptideIdentification vector to be appended + * @param search_engine search engine to depend on for feature creation + * + * Appends a vector of PeptideIdentification to another and registers concatenation Percolator features depending on given search engine. + */ + static void concatMULTISEPeptideIds(std::vector& all_peptide_ids, std::vector& new_peptide_ids, String search_engine); + + /** + * @brief mergeMULTISEPeptideIds + * @param all_peptide_ids PeptideIdentification vector to be merged into + * @param new_peptide_ids PeptideIdentification vector to merge + * @param search_engine search engine to create features from their scores + * + * Merges a vector of PeptideIdentification into another and registers merge Percolator features depending on given search engine. + */ + static void mergeMULTISEPeptideIds(std::vector& all_peptide_ids, std::vector& new_peptide_ids, String search_engine); + + /** + * @brief mergeMULTISEProteinIds + * @param all_protein_ids ProteinIdentification vector to be merged into + * @param new_protein_ids ProteinIdentification vector to merge + * + * Concatenates SearchParameter of multiple search engine runs and merges PeptideEvidences, registers the created Percolator features + */ + static void mergeMULTISEProteinIds(std::vector& all_protein_ids, std::vector& new_protein_ids); + + + /** + * @brief addMSGFFeatures + * @param peptide_ids PeptideIdentification vector to create Percolator features in + * @param feature_set register of added features + * + * Creates and adds MSGF+ specific Percolator features and registers them in feature_set + */ + static void addMSGFFeatures(std::vector& peptide_ids, StringList& feature_set); + + /** + * @brief addXTANDEMFeatures + * @param peptide_ids PeptideIdentification vector to create Percolator featrues in + * @param feature_set register of added features + * + * Creates and adds X!Tandem specific Percolator features and registers them in feature_set + */ + static void addXTANDEMFeatures(std::vector& peptide_ids, StringList& feature_set); + + /** + * @brief addCOMETFeatures + * @param peptide_ids PeptideIdentification vector to create Percolator featrues in + * @param feature_set register of added features + * + * Creates and adds Comet specific Percolator features and registers them in feature_set + */ + static void addCOMETFeatures(std::vector& peptide_ids, StringList& feature_set); + + /** + * @brief addMASCOTFeatures + * @param peptide_ids PeptideIdentification vector to create Percolator featrues in + * @param feature_set register of added features + * + * Creates and adds Mascot specific Percolator features and registers them in feature_set + */ + static void addMASCOTFeatures(std::vector& peptide_ids, StringList& feature_set); + + /** + * @brief addMULTISEFeatures + * @param peptide_ids PeptideIdentification vector to create Percolator featrues in + * @param search_engines_used the list of search engines to be considered + * @param feature_set register of added features + * @param complete_only will only add features for PeptideIdentifications where all given search engines identified something + * @param limits_imputation + * + * Adds multiple search engine specific Percolator features and registers them in feature_set + */ + static void addMULTISEFeatures(std::vector& peptide_ids, StringList& search_engines_used, StringList& feature_set, bool complete_only = true, bool limits_imputation = false); + + /** + * @brief addCONCATSEFeatures + * @param peptide_id_list PeptideIdentification vector to create Percolator featrues in + * @param search_engines_used the list of search engines to be considered + * @param feature_set register of added features + * + * Adds multiple search engine specific Percolator features and registers them in feature_set + */ + static void addCONCATSEFeatures(std::vector& peptide_id_list, StringList& search_engines_used, StringList& feature_set); + + /** + * @brief checkExtraFeatures + * @param psms the vector of PeptideHit to be checked + * @param extra_features the list of requested extra features + * + * checks and removes requested extra Percolator features that are actually unavailable (to compute) + */ + static void checkExtraFeatures(const std::vector &psms, StringList& extra_features); + + + protected: + PercolatorFeatureSetHelper(); + virtual ~PercolatorFeatureSetHelper(); + + /// Rescales the fragment features to penalize features calculated by few ions, adapted from MSGFtoPercolator + static double rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons); + + /// helper functin for assigning the frequently occurring feature delta score + static void assignDeltaScore_(std::vector& hits, String score_ref, String output_ref); + + /// gets the scan identifer to merge by + static String getScanMergeKey_(std::vector::iterator it, std::vector::iterator start); + + /// For accession dependent sorting of ProteinHits + struct lq_ProteinHit + { + inline bool operator() (const ProteinHit& h1, const ProteinHit& h2) + { + return (h1.getAccession() < h2.getAccession()); + } + }; + + /// For accession dependent sorting of PeptideEvidences + struct lq_PeptideEvidence + { + inline bool operator() (const PeptideEvidence& h1, const PeptideEvidence& h2) + { + return (h1.getProteinAccession() < h2.getProteinAccession()); + } + }; + + }; + +} //namespace OpenMS + +#endif //OPENMS_ANALYSIS_ID_PERCOLATORFEATURESETHELPER_H + diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake index d17134f8bb3..178844e4e54 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake +++ b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake @@ -23,6 +23,7 @@ MetaboliteSpectralMatching.h PeptideProteinResolution.h ProtonDistributionModel.h PeptideIndexing.h +PercolatorFeatureSetHelper.h ) ### add path to the filenames diff --git a/src/openms/include/OpenMS/FORMAT/FileTypes.h b/src/openms/include/OpenMS/FORMAT/FileTypes.h index f1cd7a2eb64..1fea13472b5 100644 --- a/src/openms/include/OpenMS/FORMAT/FileTypes.h +++ b/src/openms/include/OpenMS/FORMAT/FileTypes.h @@ -102,6 +102,7 @@ namespace OpenMS MRM, ///< SpectraST MRM List PSMS, ///< Percolator tab-delimited output (PSM level) PARAMXML, ///< internal format for writing and reading parameters (also used as part of CTD) + PIN, ///< Percolator tab-delimited input (PSM level) SIZE_OF_TYPE ///< No file type. Simply stores the number of types }; diff --git a/src/openms/source/ANALYSIS/ID/PercolatorFeatureSetHelper.cpp b/src/openms/source/ANALYSIS/ID/PercolatorFeatureSetHelper.cpp new file mode 100644 index 00000000000..659b335c36b --- /dev/null +++ b/src/openms/source/ANALYSIS/ID/PercolatorFeatureSetHelper.cpp @@ -0,0 +1,799 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2015. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Mathias Walzer $ +// $Authors: Mathias Walzer, Matthew The $ +// -------------------------------------------------------------------------- + +#include +#include + +using namespace std; + +namespace OpenMS +{ + void PercolatorFeatureSetHelper::addMSGFFeatures(vector& peptide_ids, StringList& feature_set) + { + feature_set.push_back("MS:1002049"); // unchanged RawScore + feature_set.push_back("MS:1002050"); // unchanged DeNovoScore + feature_set.push_back("MSGF:ScoreRatio"); + feature_set.push_back("MSGF:Energy"); + feature_set.push_back("MSGF:lnEValue"); + feature_set.push_back("IsotopeError"); // unchanged IsotopeError + feature_set.push_back("MSGF:lnExplainedIonCurrentRatio"); + feature_set.push_back("MSGF:lnNTermIonCurrentRatio"); + feature_set.push_back("MSGF:lnCTermIonCurrentRatio"); + feature_set.push_back("MSGF:lnMS2IonCurrent"); + feature_set.push_back("MSGF:MeanErrorTop7"); + feature_set.push_back("MSGF:sqMeanErrorTop7"); + feature_set.push_back("MSGF:StdevErrorTop7"); + + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! + if (hit->metaValueExists("NumMatchedMainIons")) + { + // only take features from first ranked entries and only with meanerrortop7 != 0.0 + if (hit->getMetaValue("MeanErrorTop7").toString().toDouble() != 0.0) + { + double raw_score = hit->getMetaValue("MS:1002049").toString().toDouble(); + double denovo_score = hit->getMetaValue("MS:1002050").toString().toDouble(); + + double energy = denovo_score - raw_score; + double score_ratio = raw_score * 10000; + if (denovo_score > 0) + { + score_ratio = (raw_score / denovo_score); + } + hit->setMetaValue("MSGF:ScoreRatio", score_ratio); + hit->setMetaValue("MSGF:Energy", energy); + + double ln_eval = -log(hit->getMetaValue("MS:1002053").toString().toDouble()); + hit->setMetaValue("MSGF:lnEValue", ln_eval); + + double ln_explained_ion_current_ratio = log(hit->getMetaValue("ExplainedIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! + double ln_NTerm_ion_current_ratio = log(hit->getMetaValue("NTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! + double ln_CTerm_ion_current_ratio = log(hit->getMetaValue("CTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! + hit->setMetaValue("MSGF:lnExplainedIonCurrentRatio", ln_explained_ion_current_ratio); + hit->setMetaValue("MSGF:lnNTermIonCurrentRatio", ln_NTerm_ion_current_ratio); + hit->setMetaValue("MSGF:lnCTermIonCurrentRatio", ln_CTerm_ion_current_ratio); + + double ln_MS2_ion_current = log(hit->getMetaValue("MS2IonCurrent").toString().toDouble()); + hit->setMetaValue("MSGF:lnMS2IonCurrent", ln_MS2_ion_current); + + double mean_error_top7 = hit->getMetaValue("MeanErrorTop7").toString().toDouble(); + int num_matched_main_ions = hit->getMetaValue("NumMatchedMainIons").toString().toInt(); + + double stdev_error_top7 = 0.0; + if (hit->getMetaValue("StdevErrorTop7").toString() != "NaN") + { + stdev_error_top7 = hit->getMetaValue("StdevErrorTop7").toString().toDouble(); + if (stdev_error_top7 == 0.0) + { + stdev_error_top7 = mean_error_top7; + } + } + else + { + stdev_error_top7 = mean_error_top7; + LOG_WARN << "StdevErrorTop7 is NaN, setting as MeanErrorTop7 instead." << endl; + } + + mean_error_top7 = rescaleFragmentFeature_(mean_error_top7, num_matched_main_ions); + double sq_mean_error_top7 = rescaleFragmentFeature_(mean_error_top7 * mean_error_top7, num_matched_main_ions); + stdev_error_top7 = rescaleFragmentFeature_(stdev_error_top7, num_matched_main_ions); + hit->setMetaValue("MSGF:MeanErrorTop7", mean_error_top7); + hit->setMetaValue("MSGF:sqMeanErrorTop7", sq_mean_error_top7); + hit->setMetaValue("MSGF:StdevErrorTop7", stdev_error_top7); + } + } + else LOG_WARN << "MS-GF+ PSM with missing NumMatchedMainIons skipped." << endl; + } + } + } + + void PercolatorFeatureSetHelper::addXTANDEMFeatures(vector& peptide_ids, StringList& feature_set) + { + // Find out which ions are in XTandem-File and take only these as features + StringList ion_types = ListUtils::create("a,b,c,x,y,z"); + StringList ion_types_found; + for (StringList::const_iterator ion = ion_types.begin(); ion != ion_types.end(); ++ion) + { + if (peptide_ids.front().getHits().front().getMetaValue(*ion + "_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue(*ion + "_ions").toString() != "") + { + feature_set.push_back("XTANDEM:frac_ion_" + *ion); + ion_types_found.push_back(*ion); + } + } + feature_set.push_back("XTANDEM:hyperscore"); + feature_set.push_back("XTANDEM:deltascore"); + + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + double hyper_score = it->getHits().front().getScore(); + double delta_score = hyper_score - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); + it->getHits().front().setMetaValue("XTANDEM:hyperscore", hyper_score); + it->getHits().front().setMetaValue("XTANDEM:deltascore", delta_score); + + String sequence = it->getHits().front().getSequence().toUnmodifiedString(); + int length = sequence.length(); + + // Find out correct ion types and get its Values + for (StringList::const_iterator ion = ion_types_found.begin(); ion != ion_types_found.end(); ++ion) + { + if (peptide_ids.front().getHits().front().getMetaValue(*ion + "_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue(*ion + "_ions").toString() != "") + { + // recalculate ion score + double ion_score = it->getHits().front().getMetaValue(*ion + "_ions").toString().toDouble() / length; + it->getHits().front().setMetaValue("XTANDEM:frac_ion_" + *ion, ion_score); + } + } + } + } + + void PercolatorFeatureSetHelper::addCOMETFeatures(vector& peptide_ids, StringList& feature_set) + { + feature_set.push_back("COMET:deltCn"); // recalculated deltCn = (current_XCorr - 2nd_best_XCorr) / max(current_XCorr, 1) + feature_set.push_back("COMET:deltLCn"); // deltLCn = (current_XCorr - worst_XCorr) / max(current_XCorr, 1) + feature_set.push_back("COMET:lnExpect"); // log(E-value) + feature_set.push_back("MS:1002252"); // unchanged XCorr + feature_set.push_back("MS:1002255"); // unchanged Sp = number of candidate peptides + feature_set.push_back("COMET:lnNumSP"); // log(number of candidate peptides) + feature_set.push_back("COMET:lnRankSP"); // log(rank based on Sp score) + feature_set.push_back("COMET:IonFrac"); // matched_ions / total_ions + + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + double worst_xcorr = 0, second_xcorr = 0; + Int cnt = 0; + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + double xcorr = hit->getMetaValue("MS:1002252").toString().toDouble(); + worst_xcorr = xcorr; + if (cnt == 1) second_xcorr = xcorr; + ++cnt; + } + + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + double xcorr = hit->getMetaValue("MS:1002252").toString().toDouble(); + double delta_cn = (xcorr - second_xcorr) / max(1.0, xcorr); + double delta_last_cn = (xcorr - worst_xcorr) / max(1.0, xcorr); + hit->setMetaValue("COMET:deltCn", delta_cn); + hit->setMetaValue("COMET:deltLCn", delta_last_cn); + + double ln_expect = log(hit->getMetaValue("MS:1002257").toString().toDouble()); + hit->setMetaValue("COMET:lnExpect", ln_expect); + + double ln_num_sp; + if (hit->metaValueExists("num_matched_peptides")) + { + double num_sp = hit->getMetaValue("num_matched_peptides").toString().toDouble(); + ln_num_sp = log(max(1.0, num_sp)); // if recorded, one can be safely assumed + } + else // fallback + { + ln_num_sp = hit->getMetaValue("MS:1002255").toString().toDouble(); + } + double ln_rank_sp = log(max(1.0, hit->getMetaValue("MS:1002256").toString().toDouble())); + hit->setMetaValue("COMET:lnNumSP", ln_num_sp); + hit->setMetaValue("COMET:lnRankSP", ln_rank_sp); + + double num_matched_ions = hit->getMetaValue("MS:1002258").toString().toDouble(); + double num_total_ions = hit->getMetaValue("MS:1002259").toString().toDouble(); + double ion_frac = num_matched_ions / num_total_ions; + hit->setMetaValue("COMET:IonFrac", ion_frac); + } + } + } + + /** + Features 1-9 Represent the Basic Feature Set + + feature abbreviation feature description + 1. mass Calculated monoisotopic mass of the identified peptide. Present as generic feature. + 2. charge Precursor ion charge. Present as generic feature. + 3. mScore Mascot score. Added in this function. + 4. dScore Mascot score minus Mascot score of next best nonisobaric peptide hit. Added in this function. + 5. deltaM Calculated minus observed peptide mass (in Dalton and ppm). Present as generic feature. + 6. absDeltaM Absolute value of calculated minus observed peptide mass (in Dalton and ppm). Present as generic feature. + 7. isoDeltaM Calculated minus observed peptide mass, isotope error corrected (in Dalton and ppm) + 8. uniquePeps None (0), one (1), two or more (2) distinct peptide sequences match same protein. Added in this function. + 9. mc Missed tryptic cleavages. Present as generic feature. + + Features 10-18 Represent the Extended Feature Set As Used in Mascot Percolator + + feature abbreviation feature description + 10. totInt Total ion intensity (log). Not available in mascot adapter. + 11. intMatchedTot Total matched ion intensity (log). Not available in mascot adapter. + 12. relIntMatchedTot Total matched ion intensity divided by total ion intensity. Not available in mascot adapter. + 13. binom Peptide Score as described in ref 28. Not available in mascot adapter. + 14. fragMassError Mean fragment mass error (in Dalton and ppm). Not available in mascot adapter. + 15. absFragMassError Mean absolute fragment mass error (in Dalton and ppm). Not available in mascot adapter. + 16. fracIonsMatched Fraction of calculated ions matched (per ion series). Not available in mascot adapter. + 17. seqCov Sequence coverage of matched ions (per ion series). Not available in mascot adapter. + 18. intMatched Matched ion intensity (per ion series). Not available in mascot adapter. + */ + void PercolatorFeatureSetHelper::addMASCOTFeatures(vector& peptide_ids, StringList& feature_set) + { + feature_set.push_back("MS:1001171"); // unchanged mScore + feature_set.push_back("MASCOT:delta_score"); // delta score based on mScore + feature_set.push_back("MASCOT:uniqueToProt"); // bool: peptide unique to protein + feature_set.push_back("MASCOT:hasMod"); // bool: has post translational modification + + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + it->sort(); + it->assignRanks(); + std::vector hits = it->getHits(); + assignDeltaScore_(hits, "MS:1001171", "MASCOT:delta_score"); + + for (vector::iterator hit = hits.begin(); hit != hits.end(); ++hit) + { + bool unique_to_protein = (String(hit->getMetaValue("protein_references")) == "unique"); + bool has_mod = hit->getSequence().isModified(); + hit->setMetaValue("MASCOT:uniqueToProt", unique_to_protein); + hit->setMetaValue("MASCOT:hasMod", has_mod); + } + } + } + + void PercolatorFeatureSetHelper::addCONCATSEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set) + { + for (StringList::iterator it = search_engines_used.begin(); it != search_engines_used.end(); ++it) { + feature_set.push_back("CONCAT:" + *it); + } + LOG_INFO << "Using " << ListUtils::concatenate(search_engines_used, ", ") << " as source for search engine specific features." << endl; + feature_set.push_back("CONCAT:lnEvalue"); + feature_set.push_back("CONCAT:deltaLnEvalue"); + + // feature values have been set in concatMULTISEids + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + it->sort(); + it->assignRanks(); + assignDeltaScore_(it->getHits(), "CONCAT:lnEvalue", "CONCAT:deltaLnEvalue"); + } + } + + void PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids, String search_engine) + { + LOG_DEBUG << "creating spectrum map" << endl; + + std::map unified; + //setup map of merge characteristics per spectrum + for (vector::iterator pit = all_peptide_ids.begin(); pit != all_peptide_ids.end(); ++pit) + { + PeptideIdentification ins = *pit; + ins.setScoreType("multiple"); + ins.setIdentifier("TopPerc_multiple_SE_input"); + String spectrum_reference = getScanMergeKey_(pit, all_peptide_ids.begin()); + unified[spectrum_reference] = ins; + } + LOG_DEBUG << "filled spectrum map with previously observed PSM: " << unified.size() << endl; + + int nc = 0; + int mc = 0; + LOG_DEBUG << "About to merge in:" << new_peptide_ids.size() << "PSMs." << endl; + for (vector::iterator pit = new_peptide_ids.begin(); pit != new_peptide_ids.end(); ++pit) + { + PeptideIdentification ins = *pit; + string st = pit->getScoreType(); + //prepare for merge + for (vector::iterator hit = ins.getHits().begin(); hit != ins.getHits().end(); ++hit) + { + // keep the hit score as meta value + if (st == "MS-GF:RawScore") + st = "MS:1002049"; + else if (st == "XTandem") + st = "MS:1001331"; + else if (st == "Mascot") + st = "MS:1001171"; + else if ((st == "expect" && search_engine == "Comet" )|| st == "Comet") + st = "MS:1002257"; + + if (!hit->metaValueExists(st)) + hit->setMetaValue(st, hit->getScore()); + + hit->setScore(1); // new 'multiple' score is just the number of times identified by different SE + + // rename ambiguous meta value names to PSI cv term ids + if (search_engine == "MS-GF+") // MS-GF should have all values as PSI cv terms available anyway + { + if (hit->metaValueExists("EValue")) + hit->setMetaValue("MS:1002053", hit->getMetaValue("EValue")); + } + if (search_engine == "Mascot") + { + if (hit->metaValueExists("EValue")) + hit->setMetaValue("MS:1001172", hit->getMetaValue("EValue")); + } + if (search_engine == "Comet") + { + if (hit->metaValueExists("xcorr")) + hit->setMetaValue("MS:1002252", hit->getMetaValue("xcorr")); + } + if (search_engine == "XTandem") + { + if (hit->metaValueExists("E-Value")) + hit->setMetaValue("MS:1001330", hit->getMetaValue("E-Value")); + } + + } + ins.setScoreType("multiple"); + ins.setIdentifier("TopPerc_multiple_SE_input"); + String spectrum_reference = getScanMergeKey_(pit, new_peptide_ids.begin()); + //merge in unified map + // insert newly identified spectra (PeptideIdentifications) or .. + if (unified.find(spectrum_reference) == unified.end()) + { + unified[spectrum_reference] = ins; + ++nc; + } + // .. add PSMs to already identified spectrum + else + { + //find corresponding hit (i.e. sequences must match) + for (vector::iterator hit = ins.getHits().begin(); hit != ins.getHits().end(); ++hit) + { + for (vector::iterator merger = unified[spectrum_reference].getHits().begin(); merger != unified[spectrum_reference].getHits().end(); ++merger) + { + if (hit->getSequence()==merger->getSequence()) + { + //care for peptide evidences!! set would be okay if checked for same search db in parameters, +// vector pev; +// pev.reserve(max(hit->getPeptideEvidences().size(),merger->getPeptideEvidences().size())); +// std::vector::iterator uni; +// std::sort(merger->getPeptideEvidences().begin(),merger->getPeptideEvidences().end(), TopPerc::lq_PeptideEvidence); +// std::sort(hit->getPeptideEvidences().begin(),hit->getPeptideEvidences().end(), TopPerc::lq_PeptideEvidence); +// uni = std::set_union(swop.front().getHits().begin(), swop.front().getHits().end(), +// it->front().getHits().begin(),it->front().getHits().end(), pev.begin(), +// TopPerc::lq_PeptideEvidence); +// pev.resize(uni-pev.begin()); +// merger->setPeptideEvidences(pev); + //There is no mutable getPeptideEvidences() accessor in PeptideHit - above will not werk, but so long: + //Implying PeptideIndexer was applied (with the same search db each) will care for that all PeptideEvidences from two hits with equal AASequence are the same + + //merge meta values + StringList keys; + hit->getKeys(keys); + for (StringList::const_iterator kt = keys.begin(); kt != keys.end(); ++kt) + { + if (!merger->metaValueExists(*kt)) + { + merger->setMetaValue(*kt, hit->getMetaValue(*kt)); + } + } + + // adds up the number of hits, as the score of each separate (new) hit is 1 + merger->setScore(merger->getScore() + hit->getScore()); + ++mc; + break; + } + } + } + } + } + + LOG_DEBUG << "filled spectrum map" << endl; + std::vector swip; + swip.reserve(unified.size()); + LOG_DEBUG << "merging spectrum map" << endl; + for (std::map::iterator it = unified.begin(); it != unified.end(); ++it) + { + swip.push_back(it->second); + } + all_peptide_ids.swap(swip); + LOG_DEBUG << "Now containing " << all_peptide_ids.size() << " spectra identifications, new: " << nc << " merged in: " << mc << endl; + } + + // references from PeptideHits to ProteinHits work with the protein accessions, so no need to update the PeptideHits + void PercolatorFeatureSetHelper::mergeMULTISEProteinIds(vector& all_protein_ids, vector& new_protein_ids) + { + LOG_DEBUG << "merging search parameters" << endl; + + String SE = new_protein_ids.front().getSearchEngine(); + if (all_protein_ids.empty()) + { + all_protein_ids.push_back(ProteinIdentification()); + DateTime now = DateTime::now(); + String date_string = now.getDate(); + String identifier = "TopPerc_" + date_string; + all_protein_ids.front().setDateTime(now); + all_protein_ids.front().setIdentifier(identifier); + all_protein_ids.front().setSearchEngine(SE); + LOG_DEBUG << "Setting search engine to " << SE << endl; + all_protein_ids.front().setSearchParameters(new_protein_ids.front().getSearchParameters()); + } + else if (all_protein_ids.front().getSearchEngine() != SE) + { + all_protein_ids.front().setSearchEngine("multiple"); + } + std::vector& all_protein_hits = all_protein_ids.front().getHits(); + std::vector& new_protein_hits = new_protein_ids.front().getHits(); + + LOG_DEBUG << "Sorting " << new_protein_hits.size() << " new ProteinHits." << endl; + std::sort(new_protein_hits.begin(), new_protein_hits.end(), PercolatorFeatureSetHelper::lq_ProteinHit()); + + LOG_DEBUG << "Melting with " << all_protein_hits.size() << " previous ProteinHits." << endl; + if (all_protein_hits.empty()) + { + all_protein_hits.swap(new_protein_hits); + } + else + { + std::vector tmp_protein_hits(new_protein_hits.size() + all_protein_hits.size()); + std::vector::iterator uni = set_union( + all_protein_hits.begin(), all_protein_hits.end(), + new_protein_hits.begin(), new_protein_hits.end(), tmp_protein_hits.begin(), + PercolatorFeatureSetHelper::lq_ProteinHit() ); + tmp_protein_hits.resize(uni - tmp_protein_hits.begin()); + all_protein_hits.swap(tmp_protein_hits); + } + LOG_DEBUG << "Done with next ProteinHits." << endl; + + StringList keys; + all_protein_ids.front().getSearchParameters().getKeys(keys); + if (!ListUtils::contains(keys, "SE:" + SE)) + { + LOG_DEBUG << "Melting Parameters from " << SE << " into MetaInfo." << endl; + + //insert into MetaInfo as SE:param + ProteinIdentification::SearchParameters sp = new_protein_ids.front().getSearchParameters(); + ProteinIdentification::SearchParameters all_sp = all_protein_ids.front().getSearchParameters(); + all_sp.setMetaValue("SE:"+SE,new_protein_ids.front().getSearchEngineVersion()); + all_sp.setMetaValue(SE+":db",sp.db); + all_sp.setMetaValue(SE+":db_version",sp.db_version); + all_sp.setMetaValue(SE+":taxonomy",sp.taxonomy); + all_sp.setMetaValue(SE+":charges",sp.charges); + all_sp.setMetaValue(SE+":fixed_modifications",ListUtils::concatenate(sp.fixed_modifications, ",")); + all_sp.setMetaValue(SE+":variable_modifications",ListUtils::concatenate(sp.variable_modifications, ",")); + all_sp.setMetaValue(SE+":missed_cleavages",sp.missed_cleavages); + all_sp.setMetaValue(SE+":fragment_mass_tolerance",sp.fragment_mass_tolerance); + all_sp.setMetaValue(SE+":fragment_mass_tolerance_ppm",sp.fragment_mass_tolerance_ppm); + all_sp.setMetaValue(SE+":precursor_mass_tolerance",sp.precursor_mass_tolerance); + all_sp.setMetaValue(SE+":precursor_mass_tolerance_ppm",sp.precursor_mass_tolerance_ppm); + all_sp.setMetaValue(SE+":digestion_enzyme",sp.digestion_enzyme.getName()); + + LOG_DEBUG << "Done with next Parameters." << endl; + all_protein_ids.front().setSearchParameters(all_sp); + } + LOG_DEBUG << "Merging primaryMSRunPaths." << endl; + try + { + StringList all_primary_ms_run_path = all_protein_ids.front().getPrimaryMSRunPath(); + StringList new_primary_ms_run_path = new_protein_ids.front().getPrimaryMSRunPath(); + + all_primary_ms_run_path.insert(all_primary_ms_run_path.end(), new_primary_ms_run_path.begin(), new_primary_ms_run_path.end()); + all_protein_ids.front().setPrimaryMSRunPath(all_primary_ms_run_path); + + LOG_DEBUG << "New primary run paths: " << ListUtils::concatenate(new_primary_ms_run_path,",") << endl; + LOG_DEBUG << "All primary run paths: " << ListUtils::concatenate(all_primary_ms_run_path,",") << endl; + } + catch (Exception::BaseException& e) + { + LOG_DEBUG << "faulty primary MS run path: " << endl; + Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, e.what(), ""); + } + LOG_DEBUG << "Merging for this file finished." << endl; + } + + void PercolatorFeatureSetHelper::concatMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids, String search_engine) + { + for (vector::iterator pit = new_peptide_ids.begin(); pit != new_peptide_ids.end(); ++pit) + { + for (vector::iterator hit = pit->getHits().begin(); hit != pit->getHits().end(); ++hit) + { + double evalue = 1000.0; + if (search_engine == "MS-GF+") + { + hit->setMetaValue("CONCAT:" + search_engine, hit->getMetaValue("MS:1002049")); // rawscore + evalue = hit->getMetaValue("MS:1002049").toString().toDouble(); // evalue + } + if (search_engine == "Mascot") + { + hit->setMetaValue("CONCAT:" + search_engine, hit->getMetaValue("MS:1001171")); // mscore + evalue = hit->getMetaValue("EValue").toString().toDouble(); + } + if (search_engine == "Comet") + { + hit->setMetaValue("CONCAT:" + search_engine, hit->getMetaValue("MS:1002252")); // xcorr + evalue = hit->getMetaValue("MS:1002257").toString().toDouble(); + } + if (search_engine == "XTandem") + { + hit->setMetaValue("CONCAT:" + search_engine, hit->getMetaValue("XTandem_score")); // xtandem score + evalue = hit->getMetaValue("E-Value").toString().toDouble(); + } + hit->setMetaValue("CONCAT:lnEvalue", log(evalue)); // log(evalue) + } + } + all_peptide_ids.insert(all_peptide_ids.end(), new_peptide_ids.begin(), new_peptide_ids.end()); + } + + void PercolatorFeatureSetHelper::addMULTISEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set, bool complete_only, bool limits_imputation) + { + map > extremals; // will have as keys the below SE cv terms + vector max_better, min_better; + // This is the minimum set for each SE that should be available in all openms id files in one way or another + if (ListUtils::contains(search_engines_used, "MS-GF+")) + { + feature_set.push_back("MS:1002049"); // rawscore + feature_set.push_back("MS:1002053"); // evalue + max_better.push_back("MS:1002049"); // higher is better - start high, get min + min_better.push_back("MS:1002053"); // lower is better - start low, get max + } + if (ListUtils::contains(search_engines_used, "Mascot")) + { + feature_set.push_back("MS:1001171"); // score aka Mascot + feature_set.push_back("MS:1001172"); // evalue aka EValue + max_better.push_back("MS:1001171"); // higher is better - start high, get min + min_better.push_back("MS:1001172"); // lower is better - start low, get max + } + if (ListUtils::contains(search_engines_used, "Comet")) + { + feature_set.push_back("MS:1002252"); // xcorr + feature_set.push_back("MS:1002257"); // evalue + max_better.push_back("MS:1002252"); // higher is better - start high, get min + min_better.push_back("MS:1002257"); // lower is better - start low, get max + } + if (ListUtils::contains(search_engines_used, "XTandem")) + { + feature_set.push_back("MS:1001331"); // hyperscore aka XTandem + feature_set.push_back("MS:1001330"); // evalue aka E-Value + max_better.push_back("MS:1001331"); // higher is better - start high, get min + min_better.push_back("MS:1001330"); // lower is better - start low, get max + } + //feature_set.push_back("MULTI:ionFrac"); + //feature_set.push_back("MULTI:numHits"); // this is not informative if we only keep PSMs with hits for all search engines + + LOG_INFO << "Using " << ListUtils::concatenate(search_engines_used, ", ") << " as source for search engine specific features." << endl; + + // get all the feature values + if (!complete_only) + { + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + for (StringList::iterator feats = feature_set.begin(); feats != feature_set.end(); ++feats) + { + if (hit->metaValueExists(*feats)) + { + // TODO raise issue: MS-GF raw score values are sometimes registered as string DataValues and henceforth casted defectively + if (hit->getMetaValue(*feats).valueType() == DataValue::STRING_VALUE) + { + string recast = hit->getMetaValue(*feats); + double d = boost::lexical_cast(recast); + LOG_DEBUG << "recast: " + << recast << " " + << double(hit->getMetaValue(*feats)) << "* "; + hit->setMetaValue(*feats,d); + LOG_DEBUG << hit->getMetaValue(*feats).valueType() << " " + << hit->getMetaValue(*feats) + << endl; + } + extremals[*feats].push_back(hit->getMetaValue(*feats)); + } + } + } + } + // TODO : add optional manual extremal values settings for 'data imputation' instead of min/max or numeric_limits value + for (vector::iterator maxbt = max_better.begin(); maxbt != max_better.end(); ++maxbt) + { + map >::iterator fi = extremals.find(*maxbt); + if (fi != extremals.end()) + { + vector::iterator mymax = min_element(fi->second.begin(), fi->second.end()); + iter_swap(fi->second.begin(), mymax); + if (limits_imputation) + { + fi->second.front() = -std::numeric_limits::max(); + } + } + } + for (vector::iterator minbt = min_better.begin(); minbt != min_better.end(); ++minbt) + { + map >::iterator fi = extremals.find(*minbt); + if (fi != extremals.end()) + { + vector::iterator mymin = max_element(fi->second.begin(), fi->second.end()); + iter_swap(fi->second.begin(), mymin); + if (limits_imputation) + { + fi->second.front() = std::numeric_limits::max(); + } + } + } + } + + size_t sum_removed = 0; + size_t imputed_values = 0; + size_t observed_values = 0; + size_t complete_spectra = 0; + size_t incomplete_spectra = 0; + + LOG_DEBUG << "Looking for minimum feature set:" << ListUtils::concatenate(feature_set, ", ") << "." << endl; + + for (vector::iterator pi = peptide_ids.begin(); pi != peptide_ids.end(); ++pi) + { + pi->sort(); + pi->assignRanks(); + vector::iterator> incompletes; + + size_t imputed_back = imputed_values; + for (vector::iterator hit = pi->getHits().begin(); hit != pi->getHits().end(); ++hit) + { + //double ion_frac = hit->getMetaValue("matched_intensity").toString().toDouble() / hit->getMetaValue("sum_intensity").toString().toDouble(); // also consider "matched_ion_number"/"peak_number" + //hit->setMetaValue("MULTI:ionFrac", ion_frac); + + for (StringList::iterator feats = feature_set.begin(); feats != feature_set.end(); ++feats) + { + if (complete_only && !hit->metaValueExists(*feats)) + { + incompletes.push_back(hit); // mark for removal + break; + } + else if (!hit->metaValueExists(*feats)) + { + hit->setMetaValue(*feats, extremals[*feats].front()); // imputation + ++imputed_values; + } + else + { + ++observed_values; + } + } + int num_hits = hit->getScore(); + hit->setMetaValue("MULTI:numHits", num_hits); + } + if (complete_only) + { + // remove incompletes + for (vector::iterator>::reverse_iterator rit = incompletes.rbegin(); rit != incompletes.rend(); ++rit) + { + pi->getHits().erase(*rit); + } + sum_removed += incompletes.size(); + } + if (incompletes.size() > 0 || imputed_back < imputed_values) + ++incomplete_spectra; + else + ++complete_spectra; + } + if (sum_removed > 0) + { + LOG_WARN << "Removed " << sum_removed << " incomplete cases of PSMs." << endl; + } + if (imputed_values > 0) + { + LOG_WARN << "Imputed " << imputed_values << " of " << observed_values+imputed_values + << " missing values. (" + << imputed_values*100.0/(imputed_values+observed_values) + << "%)" << endl; + LOG_WARN << "Affected " << incomplete_spectra << " of " << incomplete_spectra+complete_spectra + << " spectra. (" + << incomplete_spectra*100.0/(incomplete_spectra+complete_spectra) + << "%)" << endl; + } + } + + void PercolatorFeatureSetHelper::checkExtraFeatures(const vector& psms, StringList& extra_features) + { + set unavail; + for (vector::const_iterator hit = psms.begin(); hit != psms.end(); ++hit) + { + for (StringList::iterator ef = extra_features.begin(); ef != extra_features.end(); ++ef) + { + if (!hit->metaValueExists(*ef)) + { + unavail.insert(ef); + } + } + } + for (set::reverse_iterator rit = unavail.rbegin(); rit != unavail.rend(); ++rit) + { + LOG_WARN << "A extra_feature requested (" << *(*rit) << ") was not available - removed." << endl; + extra_features.erase(*rit); + } + } + + + // Function adapted from MsgfplusReader in Percolator converter + double PercolatorFeatureSetHelper::rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons) + { + // Rescale the fragment features to penalize features calculated by few ions + int numMatchedIonLimit = 7; + int numerator = (1 + numMatchedIonLimit) * (1 + numMatchedIonLimit); + int denominator = (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)) * (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)); + return featureValue * ((double)numerator / denominator); + } + + void PercolatorFeatureSetHelper::assignDeltaScore_(vector& hits, String score_ref, String output_ref) + { + if (!hits.empty()) + { + vector::iterator prev = hits.begin(); + double prev_score = double(prev->getMetaValue(score_ref)); + for (vector::iterator hit = hits.begin()+1; hit != hits.end(); ++hit) + { + double cur_score = double(hit->getMetaValue(score_ref)); + double value = prev_score - cur_score; + prev->setMetaValue(output_ref, value); + prev = hit; + } + (hits.end()-1)->setMetaValue(output_ref, 0.0); //if last hit or only one hit + } + } + + // TODO: this is code redundancy to PercolatorAdapter + String PercolatorFeatureSetHelper::getScanMergeKey_(vector::iterator it, vector::iterator start) + { + // MSGF+ uses this field, is empty if not specified + String scan_identifier = it->getMetaValue("spectrum_reference"); + if (scan_identifier.empty()) + { + // XTandem uses this (integer) field + // these ids are 1-based in contrast to the index which is 0-based, so subtract 1. + if (it->metaValueExists("spectrum_id") && !it->getMetaValue("spectrum_id").toString().empty()) + { + scan_identifier = "index=" + String(it->getMetaValue("spectrum_id").toString().toInt() - 1); + } + else + { + scan_identifier = "index=" + String(it - start + 1); + LOG_WARN << "no known spectrum identifiers, using index [1,n] - use at own risk." << endl; + } + } + + Int scan = 0; + StringList fields = ListUtils::create(scan_identifier); + for (StringList::const_iterator it = fields.begin(); it != fields.end(); ++it) + { + Size idx = 0; + if ((idx = it->find("scan=")) != string::npos) + { + scan = it->substr(idx + 5).toInt(); + break; + } // only if scan number is not available, use the scan index + else if ((idx = it->find("index=")) != string::npos) + { + scan = it->substr(idx + 6).toInt(); + } + } + return String(scan); + } + + +} diff --git a/src/openms/source/ANALYSIS/ID/sources.cmake b/src/openms/source/ANALYSIS/ID/sources.cmake index 341aefedcef..defacb14ccc 100644 --- a/src/openms/source/ANALYSIS/ID/sources.cmake +++ b/src/openms/source/ANALYSIS/ID/sources.cmake @@ -23,6 +23,7 @@ MetaboliteSpectralMatching.cpp PeptideProteinResolution.cpp ProtonDistributionModel.cpp PeptideIndexing.cpp +PercolatorFeatureSetHelper.cpp ) ### add path to the filenames diff --git a/src/openms/source/APPLICATIONS/ToolHandler.cpp b/src/openms/source/APPLICATIONS/ToolHandler.cpp index d7c814fd1c6..9daff9e26dc 100755 --- a/src/openms/source/APPLICATIONS/ToolHandler.cpp +++ b/src/openms/source/APPLICATIONS/ToolHandler.cpp @@ -56,6 +56,7 @@ namespace OpenMS tools_map["ConsensusMapNormalizer"] = Internal::ToolDescription("ConsensusMapNormalizer", "Map Alignment"); tools_map["ConvertTraMLToTSV"] = Internal::ToolDescription("ConvertTraMLToTSV", "Targeted Experiments"); tools_map["ConvertTSVToTraML"] = Internal::ToolDescription("ConvertTSVToTraML", "Targeted Experiments"); + tools_map["COMETAdapter"] = Internal::ToolDescription("COMETAdapter", "Identification"); tools_map["Decharger"] = Internal::ToolDescription("Decharger", "Quantitation"); tools_map["DTAExtractor"] = Internal::ToolDescription("DTAExtractor", "File Handling"); tools_map["EICExtractor"] = Internal::ToolDescription("EICExtractor", "Quantitation"); @@ -123,6 +124,7 @@ namespace OpenMS tools_map["PeakPickerWavelet"] = Internal::ToolDescription("PeakPickerWavelet", "Signal processing and preprocessing"); tools_map["PepNovoAdapter"] = Internal::ToolDescription("PepNovoAdapter", "Identification"); tools_map["PeptideIndexer"] = Internal::ToolDescription("PeptideIndexer", "ID Processing"); + tools_map["PercolatorAdapter"] = Internal::ToolDescription("PercolatorAdapter", "ID Processing"); tools_map["PhosphoScoring"] = Internal::ToolDescription("PhosphoScoring", "ID Processing"); tools_map["PrecursorIonSelector"] = Internal::ToolDescription("PrecursorIonSelector", "Targeted Experiments"); tools_map["PrecursorMassCorrector"] = Internal::ToolDescription("PrecursorMassCorrector", "Signal processing and preprocessing"); @@ -218,6 +220,7 @@ namespace OpenMS util_map["OpenSwathWorkflow"] = Internal::ToolDescription("OpenSwathWorkflow", util_category); util_map["PeakPickerIterative"] = Internal::ToolDescription("PeakPickerIterative", "Signal processing and preprocessing"); //util_map["PeakPickerRapid"] = Internal::ToolDescription("PeakPickerRapid", "Signal processing and preprocessing"); + util_map["PSMFeatureExtractor"] = Internal::ToolDescription("PSMFeatureExtractor", util_category); util_map["QCCalculator"] = Internal::ToolDescription("QCCalculator", util_category); util_map["QCEmbedder"] = Internal::ToolDescription("QCEmbedder", util_category); util_map["QCExtractor"] = Internal::ToolDescription("QCExtractor", util_category); @@ -237,7 +240,6 @@ namespace OpenMS util_map["SvmTheoreticalSpectrumGeneratorTrainer"] = Internal::ToolDescription("SvmTheoreticalSpectrumGeneratorTrainer", util_category); util_map["TICCalculator"] = Internal::ToolDescription("TICCalculator", util_category); util_map["TransformationEvaluation"] = Internal::ToolDescription("TransformationEvaluation", util_category); - util_map["TopPerc"] = Internal::ToolDescription("TopPerc", util_category); util_map["XMLValidator"] = Internal::ToolDescription("XMLValidator", util_category); // STOP! insert your tool in alphabetical order for easier maintenance (only tools requiring the GUI lib should be added below) diff --git a/src/openms/source/FORMAT/FileTypes.cpp b/src/openms/source/FORMAT/FileTypes.cpp index 0c19526fec0..413083cfd5e 100644 --- a/src/openms/source/FORMAT/FileTypes.cpp +++ b/src/openms/source/FORMAT/FileTypes.cpp @@ -131,7 +131,7 @@ namespace OpenMS targetMap[FileTypes::MRM] = "mrm"; targetMap[FileTypes::PSMS] = "psms"; targetMap[FileTypes::PARAMXML] = "paramXML"; - + targetMap[FileTypes::PIN] = "pin"; return targetMap; } diff --git a/src/openms/source/FORMAT/HANDLERS/MzIdentMLDOMHandler.cpp b/src/openms/source/FORMAT/HANDLERS/MzIdentMLDOMHandler.cpp index 621ac3cd299..b7b6ae0a65f 100644 --- a/src/openms/source/FORMAT/HANDLERS/MzIdentMLDOMHandler.cpp +++ b/src/openms/source/FORMAT/HANDLERS/MzIdentMLDOMHandler.cpp @@ -2032,7 +2032,7 @@ namespace OpenMS hit.setMetaValue(up->first, up->second); } hit.setMetaValue("calcMZ", calculatedMassToCharge); - spectrum_identification.setMZ(experimentalMassToCharge); // TODO @ mths: why is this not in SpectrumIdentificationResult? exp. m/z for one spec should not change from one id for it to the next! + spectrum_identification.setMZ(experimentalMassToCharge); // TODO @ mths for next PSI meeting: why is this not in SpectrumIdentificationResult in the schema? exp. m/z for one spec should not change from one id for it to the next! hit.setMetaValue("pass_threshold", pass); //TODO @ mths do not write metavalue pass_threshold //connect the PeptideHit with PeptideEvidences (for AABefore/After) and subsequently with DBSequence (for ProteinAccession) diff --git a/src/openms/source/FORMAT/PepXMLFile.cpp b/src/openms/source/FORMAT/PepXMLFile.cpp index 26e6bf8aa3d..7f56760548b 100755 --- a/src/openms/source/FORMAT/PepXMLFile.cpp +++ b/src/openms/source/FORMAT/PepXMLFile.cpp @@ -767,6 +767,7 @@ namespace OpenMS if (!exp_name_.empty()) { String base_name = attributeAsString_(attributes, "base_name"); + cout << "base name:" << base_name << " exp_name_:" << exp_name_ << endl; if (!base_name.empty()) { wrong_experiment_ = !base_name.hasSuffix(exp_name_); diff --git a/src/tests/class_tests/openms/data/combined.concat.perco.in.idXML b/src/tests/class_tests/openms/data/combined.concat.perco.in.idXML new file mode 100644 index 00000000000..f90b5876d9d --- /dev/null +++ b/src/tests/class_tests/openms/data/combined.concat.perco.in.idXML @@ -0,0 +1,826 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/tests/class_tests/openms/data/combined.merge.perco.in.idXML b/src/tests/class_tests/openms/data/combined.merge.perco.in.idXML new file mode 100644 index 00000000000..77f5d42d17c --- /dev/null +++ b/src/tests/class_tests/openms/data/combined.merge.perco.in.idXML @@ -0,0 +1,179 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/tests/class_tests/openms/data/comet.topperc.idXML b/src/tests/class_tests/openms/data/comet.topperc.idXML new file mode 100644 index 00000000000..9a0309ea6e3 --- /dev/null +++ b/src/tests/class_tests/openms/data/comet.topperc.idXML @@ -0,0 +1,242 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/tests/class_tests/openms/data/comet.topperc_check.idXML b/src/tests/class_tests/openms/data/comet.topperc_check.idXML new file mode 100644 index 00000000000..0c39a50317d --- /dev/null +++ b/src/tests/class_tests/openms/data/comet.topperc_check.idXML @@ -0,0 +1,347 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/tests/class_tests/openms/data/msgf.topperc.idXML b/src/tests/class_tests/openms/data/msgf.topperc.idXML new file mode 100644 index 00000000000..b559403539a --- /dev/null +++ b/src/tests/class_tests/openms/data/msgf.topperc.idXML @@ -0,0 +1,483 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/tests/class_tests/openms/data/msgf.topperc_check.idXML b/src/tests/class_tests/openms/data/msgf.topperc_check.idXML new file mode 100644 index 00000000000..4be5561260e --- /dev/null +++ b/src/tests/class_tests/openms/data/msgf.topperc_check.idXML @@ -0,0 +1,638 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/tests/class_tests/openms/data/xtandem.topperc.idXML b/src/tests/class_tests/openms/data/xtandem.topperc.idXML new file mode 100644 index 00000000000..f04289e6eca --- /dev/null +++ b/src/tests/class_tests/openms/data/xtandem.topperc.idXML @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/tests/class_tests/openms/data/xtandem.topperc_check.idXML b/src/tests/class_tests/openms/data/xtandem.topperc_check.idXML new file mode 100644 index 00000000000..5fdc6b11146 --- /dev/null +++ b/src/tests/class_tests/openms/data/xtandem.topperc_check.idXML @@ -0,0 +1,104 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index ae27d2b0b5b..764d455863f 100644 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -485,6 +485,7 @@ set(analysis_executables_list SimpleSVM_test StablePairFinder_test #TargetedExperimentHelper_test + PercolatorFeatureSetHelper_test TransformationDescription_test TransformationModel_test TransformationModelBSpline_test diff --git a/src/tests/class_tests/openms/source/PercolatorFeatureSetHelper_test.cpp b/src/tests/class_tests/openms/source/PercolatorFeatureSetHelper_test.cpp new file mode 100644 index 00000000000..5cbd0393597 --- /dev/null +++ b/src/tests/class_tests/openms/source/PercolatorFeatureSetHelper_test.cpp @@ -0,0 +1,263 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2017. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: MATHIAS WALZER$ +// $Authors: MATHIAS WALZER$ +// -------------------------------------------------------------------------- + +#include +#include +#include +#include + +/////////////////////////// +#include +/////////////////////////// + +using namespace OpenMS; +using namespace std; + +bool check_pepids(vector check, vector against) +{ + std::vector upk, upkc; + TEST_EQUAL(check.size(), against.size()) + if (check.size() != against.size()) + return false; + for (size_t i = 0; i < check.size(); ++i) + { + TEST_EQUAL(check[i].getHits().size(), against[i].getHits().size()) + for (size_t j = 0; j < check[i].getHits().size(); ++j) + { + check [i].getHits()[j].getKeys(upkc); + against[i].getHits()[j].getKeys(upk); + TEST_EQUAL(upkc.size(), upk.size()) + if (upkc.size() != upk.size()) + return false; + for (size_t k = 0; k < upk.size(); ++k) + TEST_STRING_EQUAL(upkc[k],upk[k]) + } + } + return true; +} + +bool check_proids(vector check, vector against, vector fs) +{ + TEST_EQUAL(check.size(), against.size()) + if (check.size()!= against.size()) + return false; + for (size_t i = 0; i < check.size(); ++i) + TEST_EQUAL(check[i].getHits().size(), against[i].getHits().size()) + + String efc = check.front().getSearchParameters().getMetaValue("extra_features"); + TEST_STRING_EQUAL(efc, ListUtils::concatenate(fs, ",")) + return true; +} + +START_TEST(TopPerc, "$Id$") + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// + +STATUS("Preparing test inputs.") + +std::vector< PeptideIdentification > comet_check_pids; +std::vector< PeptideIdentification > msgf_check_pids; +std::vector< PeptideIdentification > xtandem_check_pids; +std::vector< PeptideIdentification > merge_check_pids; +std::vector< PeptideIdentification > concat_check_pids; +std::vector< ProteinIdentification > comet_check_pods; +std::vector< ProteinIdentification > msgf_check_pods; +std::vector< ProteinIdentification > xtandem_check_pods; +std::vector< ProteinIdentification > concat_check_pods; +std::vector< ProteinIdentification > merge_check_pods; + +IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc_check.idXML"), comet_check_pods, comet_check_pids); +IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc_check.idXML"), msgf_check_pods, msgf_check_pids); +IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("xtandem.topperc_check.idXML"), xtandem_check_pods, xtandem_check_pids); +IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("combined.merge.perco.in.idXML"), merge_check_pods, merge_check_pids); +IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("combined.concat.perco.in.idXML"), concat_check_pods, concat_check_pids); + +START_SECTION((static void concatMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) +{ + StringList fs; + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification > comet_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + + StringList ses = ListUtils::create("MS-GF+,Comet"); + std::vector< PeptideIdentification > concat_pids; + PercolatorFeatureSetHelper::concatMULTISEPeptideIds(concat_pids, msgf_pids, "MS-GF+"); + PercolatorFeatureSetHelper::concatMULTISEPeptideIds(concat_pids, comet_pids, "Comet"); + PercolatorFeatureSetHelper::addCONCATSEFeatures(concat_pids, ses, fs); + + //check completeness of feature construction + ABORT_IF(!check_pepids(concat_check_pids, concat_pids)); +} +END_SECTION + +START_SECTION((static void mergeMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) +{ + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification > comet_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + + std::vector< PeptideIdentification > merge_pids; + StringList ses = ListUtils::create("MS-GF+,Comet"); + PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); + PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); + StringList empty_extra; + PercolatorFeatureSetHelper::addMULTISEFeatures(merge_pids, ses, empty_extra, true); + TEST_EQUAL(merge_pids.size(),4) + for (size_t i = merge_pids.size()-1; i > 0; --i) + { + PercolatorFeatureSetHelper::checkExtraFeatures(merge_pids[i].getHits(), empty_extra); // also check against empty extra features list and inconsistency removal + merge_pids.erase(merge_pids.begin()+i); //erase to be able to use completeness check function below + } + TEST_EQUAL(merge_pids.size(),1) + //check completeness of feature construction + ABORT_IF(!check_pepids(merge_check_pids, merge_pids)); +} +END_SECTION + +START_SECTION((static void mergeMULTISEProteinIds(std::vector< ProteinIdentification > &all_protein_ids, std::vector< ProteinIdentification > &new_protein_ids))) +{ + StringList fs; + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification > comet_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + + std::vector< ProteinIdentification > merge_pods; + PercolatorFeatureSetHelper::mergeMULTISEProteinIds(merge_pods, msgf_pods); + PercolatorFeatureSetHelper::mergeMULTISEProteinIds(merge_pods, comet_pods); + + std::vector< PeptideIdentification > merge_pids; + StringList ses = ListUtils::create("MS-GF+,Comet"); + PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); + PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); + PercolatorFeatureSetHelper::addMULTISEFeatures(merge_pids, ses, fs, true); + + //check completeness of feature construction + ABORT_IF(!check_proids(merge_check_pods, merge_pods, fs)); +} +END_SECTION + +START_SECTION((static void addMSGFFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +{ + StringList fs; + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + PercolatorFeatureSetHelper::addMSGFFeatures(msgf_pids,fs); + + //check completeness of feature construction + ABORT_IF(check_pepids(msgf_check_pids, msgf_pids)); + + //check registration of percolator features for adapter + ABORT_IF(!check_proids(msgf_check_pods, msgf_pods, fs)); +} +END_SECTION + +START_SECTION((static void addXTANDEMFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +{ + StringList fs; + std::vector< PeptideIdentification > xtandem_pids; + std::vector< ProteinIdentification > xtandem_pods; + + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("xtandem.topperc.idXML"), xtandem_pods, xtandem_pids); + PercolatorFeatureSetHelper::addXTANDEMFeatures(xtandem_pids, fs); + + //check completeness of feature construction + ABORT_IF(check_pepids(xtandem_check_pids, xtandem_pids)); + + //check registration of percolator features for adapter + ABORT_IF(check_proids(xtandem_check_pods, xtandem_pods, fs)); +} +END_SECTION + +START_SECTION((static void addCOMETFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +{ + StringList fs; + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification > comet_pods; + + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + PercolatorFeatureSetHelper::addCOMETFeatures(comet_pids, fs); + + //check completeness of feature construction + ABORT_IF(!check_pepids(comet_check_pids, comet_pids)); + + //check registration of percolator features for adapter + ABORT_IF(!check_proids(comet_check_pods, comet_pods, fs)); +} +END_SECTION + +START_SECTION((static void addMASCOTFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +{ + NOT_TESTABLE // yet +} +END_SECTION + +START_SECTION((static void addMULTISEFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &search_engines_used, StringList &feature_set, bool complete_only=true, bool limits_imputation=false))) +{ + NOT_TESTABLE // actually tested in combination with mergeMULTISEPeptideIds +} +END_SECTION + +START_SECTION((static void addCONCATSEFeatures(std::vector< PeptideIdentification > &peptide_id_list, StringList &search_engines_used, StringList &feature_set))) +{ + NOT_TESTABLE // actually tested in combination with concatMULTISEPeptideIds +} +END_SECTION + +START_SECTION((static void checkExtraFeatures(const std::vector< PeptideHit > &psms, StringList &extra_features))) +{ + NOT_TESTABLE // actually tested in combination with mergeMULTISEPeptideIds +} +END_SECTION + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// +END_TEST + + + diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp new file mode 100644 index 00000000000..c0fd9f487fd --- /dev/null +++ b/src/topp/PercolatorAdapter.cpp @@ -0,0 +1,1027 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2015. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Mathias Walzer $ +// $Authors: Andreas Simon, Mathias Walzer, Matthew The $ +// -------------------------------------------------------------------------- +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +using namespace OpenMS; +using namespace std; + +//------------------------------------------------------------- +//Doxygen docu +//------------------------------------------------------------- + +/** + @page TOPP_PercolatorAdapter PercolatorAdapter + + @brief PercolatorAdapter facilitates the input to, the call of and output integration of Percolator. + Percolator (http://per-colator.com/) is a tool to apply semi-supervised learning for peptide + identification from shotgun proteomics datasets. + + @experimental This tool is work in progress and usage and input requirements might change. + +
+ + + + + + + + + + +
pot. predecessor tools \f$ \longrightarrow \f$ PercolatorAdapter \f$ \longrightarrow \f$ pot. successor tools
@ref UTILS_PSMFeatureExtractor @ref TOPP_IDFilter
+
+

Percolator is search engine sensitive, i.e. it's input features vary, +depending on the search engine. Must be prepared beforehand. If you do not want +to use the specific features, use the generic-feature-set flag. Will incorporate +the score attribute of a PSM, so be sure, the score you want is set as main +score with @ref TOPP_IDScoreSwitcher . Be aware, that you might very well +experience a perfomance loss compared to the search engine specific features.

+ + The command line parameters of this tool are: + @verbinclude TOPP_PercolatorAdapter.cli + INI file documentation of this tool: + @htmlinclude TOPP_PercolatorAdapter.html + + Percolator is written by Lukas Käll (http://per-colator.com/ Copyright Lukas Käll ) +*/ + +// We do not want this class to show up in the docu: +/// @cond TOPPCLASSES + + +class PercolatorAdapter : + public TOPPBase +{ +public: + PercolatorAdapter() : + TOPPBase("PercolatorAdapter", "Facilitate input to Percolator and reintegrate.", false) + { + } + +protected: + struct PercolatorResult + { + String PSMId; + double score; + double qvalue; + double posterior_error_prob; + String peptide; + char preAA; + char postAA; + StringList proteinIds; + + PercolatorResult(const String& pid, const double s, const double q, const String& p, const char pre, const char pos, const StringList& pl): + PSMId (pid), + score (s), + qvalue (q), + peptide (p), + preAA (pre), + postAA (pos), + proteinIds (pl) + { + } + + explicit PercolatorResult(StringList& row): + proteinIds() + { + // peptide sequence + StringList pep; + row[4].split(".", pep); + //TODO test pep size 3 + peptide = pep[1]; + preAA = pep[0]=="-"?'[':pep[0].c_str()[0]; // const char PeptideEvidence::N_TERMINAL_AA = '['; + postAA = pep[2]=="-"?']':pep[2].c_str()[0]; // const char PeptideEvidence::C_TERMINAL_AA = ']'; + // SVM-score + score = row[1].toDouble(); + // q-Value + qvalue = row[2].toDouble(); + // PEP + posterior_error_prob = row[3].toDouble(); + // scannr. as written in preparePIN + PSMId = row[0]; + proteinIds = vector(row.begin()+5,row.end()); + } + + bool operator!=(const PercolatorResult& rhs) const + { + if (PSMId != rhs.PSMId || score != rhs.score || qvalue != rhs.qvalue || + posterior_error_prob != rhs.posterior_error_prob || peptide != rhs.peptide || + proteinIds != rhs.proteinIds) + return true; + return false; + } + + bool operator==(const PercolatorResult& rhs) const + { + return !(operator !=(rhs)); + } + }; + + struct PercolatorProteinResult + { + String protein_accession; + double qvalue; + double posterior_error_prob; + + PercolatorProteinResult(const String& pid, const double q, const double pep): + protein_accession (pid), + qvalue (q), + posterior_error_prob (pep) + { + } + + bool operator!=(const PercolatorProteinResult& rhs) const + { + if (protein_accession != rhs.protein_accession || qvalue != rhs.qvalue || + posterior_error_prob != rhs.posterior_error_prob) + return true; + return false; + } + + bool operator==(const PercolatorProteinResult& rhs) const + { + return !(operator !=(rhs)); + } + }; + + void registerOptionsAndFlags_() + { + bool is_required = true; + bool is_advanced_option = true; + registerInputFileList_("in", "", StringList(), "Input file(s)", is_required); + setValidFormats_("in", ListUtils::create("mzid,idXML")); + registerInputFileList_("in_decoy", "", StringList(), "Input decoy file(s) in case of separate searches", !is_required); + setValidFormats_("in_decoy", ListUtils::create("mzid,idXML")); + registerOutputFile_("out", "", "", "Output file in idXML format", !is_required); + registerOutputFile_("mzid_out", "", "", "Output file in mzid format", !is_required); + String enzs = "no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin"; + registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: "+enzs , !is_required); + setValidStrings_("enzyme", ListUtils::create(enzs)); + registerInputFile_("percolator_executable", "", + // choose the default value according to the platform where it will be executed + #ifdef OPENMS_WINDOWSPLATFORM + "percolator.exe", + #else + "percolator", + #endif + "Percolator executable of the installation e.g. 'percolator.exe'", is_required, !is_advanced_option, ListUtils::create("skipexists") + ); + registerFlag_("peptide-level-fdrs", "Calculate peptide-level FDRs instead of PSM-level FDRs."); + registerFlag_("protein-level-fdrs", "Use the picked protein-level FDR to infer protein probabilities. Use the -fasta option and -decoy-pattern to set the Fasta file and decoy pattern."); + + //Advanced parameters + registerFlag_("generic-feature-set", "Use only generic (i.e. not search engine specific) features. Generating search engine specific features for common search engines by PSMFeatureExtractor will typically boost the identification rate significantly.", is_advanced_option); + registerIntOption_("subset-max-train", "", 0, "Only train an SVM on a subset of PSMs, and use the resulting score vector to evaluate the other PSMs. Recommended when analyzing huge numbers (>1 million) of PSMs. When set to 0, all PSMs are used for training as normal.", !is_required, is_advanced_option); + registerDoubleOption_("cpos", "", 0.0, "Cpos, penalty for mistakes made on positive examples. Set by cross validation if not specified.", !is_required, is_advanced_option); + registerDoubleOption_("cneg", "", 0.0, "Cneg, penalty for mistakes made on negative examples. Set by cross validation if not specified.", !is_required, is_advanced_option); + registerDoubleOption_("testFDR", "", 0.01, "False discovery rate threshold for evaluating best cross validation result and the reported end result.", !is_required, is_advanced_option); + registerDoubleOption_("trainFDR", "", 0.01, "False discovery rate threshold to define positive examples in training. Set to testFDR if 0.", !is_required, is_advanced_option); + registerIntOption_("maxiter", "", 10, "Maximal number of iterations", !is_required, is_advanced_option); + registerFlag_("quick-validation", "Quicker execution by reduced internal cross-validation.", is_advanced_option); + registerOutputFile_("weights", "", "", "Output final weights to the given file", !is_required, is_advanced_option); + registerInputFile_("init-weights", "", "", "Read initial weights to the given file", !is_required, is_advanced_option); + registerStringOption_("default-direction", "", "", "The most informative feature given as the feature name, can be negated to indicate that a lower value is better.", !is_required, is_advanced_option); + registerIntOption_("verbose", "", 2, "Set verbosity of output: 0=no processing info, 5=all.", !is_required, is_advanced_option); + registerFlag_("unitnorm", "Use unit normalization [0-1] instead of standard deviation normalization", is_advanced_option); + registerFlag_("test-each-iteration", "Measure performance on test set each iteration", is_advanced_option); + registerFlag_("override", "Override error check and do not fall back on default score vector in case of suspect score vector", is_advanced_option); + registerIntOption_("seed", "", 1, "Setting seed of the random number generator.", !is_required, is_advanced_option); + registerIntOption_("doc", "", 0, "Include description of correct features", !is_required, is_advanced_option); + registerFlag_("klammer", "Retention time features calculated as in Klammer et al. Only available if -doc is set", is_advanced_option); + registerInputFile_("fasta", "", "", "Provide the fasta file as the argument to this flag, which will be used for protein grouping based on an in-silico digest (only valid if option -protein-level-fdrs is active).", !is_required, is_advanced_option); + setValidFormats_("fasta", ListUtils::create("FASTA")); + registerStringOption_("decoy-pattern", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (Only valid if option -protein-level-fdrs is active).", !is_required, is_advanced_option); + registerFlag_("post-processing-tdc", "Use target-decoy competition to assign q-values and PEPs.", is_advanced_option); + } + + // TODO replace with TopPerc::getScanMergeKey + String getScanIdentifier_(vector::iterator it, vector::iterator start) + { + // MSGF+ uses this field, is empty if not specified + String scan_identifier = it->getMetaValue("spectrum_reference"); + if (scan_identifier.empty()) + { + // XTandem uses this (integer) field + // these ids are 1-based in contrast to the index which is 0-based. This might be problematic to use for merging + if (it->metaValueExists("spectrum_id") && !it->getMetaValue("spectrum_id").toString().empty()) + { + scan_identifier = "scan=" + it->getMetaValue("spectrum_id").toString(); + } + else + { + scan_identifier = "index=" + String(it - start + 1); + LOG_WARN << "no known spectrum identifiers, using index [1,n] - use at own risk." << endl; + } + } + return scan_identifier.removeWhitespaces(); + } + + // TODO replace with TopPerc::getScanMergeKey + Int getScanNumber_(String scan_identifier) + { + Int scan_number = 0; + StringList fields = ListUtils::create(scan_identifier); + for (StringList::const_iterator it = fields.begin(); it != fields.end(); ++it) + { + // if scan number is not available, use the scan index + Size idx = 0; + if ((idx = it->find("scan=")) != string::npos) + { + scan_number = it->substr(idx + 5).toInt(); + break; + } + else if ((idx = it->find("index=")) != string::npos) + { + scan_number = it->substr(idx + 6).toInt(); + } + } + return scan_number; + } + + // Function adapted from Enzyme.h in Percolator converter + bool isEnz_(const char& n, const char& c, string& enz) + { + if (enz == "trypsin") + { + return ((n == 'K' || n == 'R') && c != 'P') || n == '-' || c == '-'; + } + else if (enz == "chymotrypsin") + { + return ((n == 'F' || n == 'W' || n == 'Y' || n == 'L') && c != 'P') || n == '-' || c == '-'; + } + else if (enz == "thermolysin") + { + return ((c == 'A' || c == 'F' || c == 'I' || c == 'L' || c == 'M' + || c == 'V' || (n == 'R' && c == 'G')) && n != 'D' && n != 'E') || n == '-' || c == '-'; + } + else if (enz == "proteinasek") + { + return (n == 'A' || n == 'E' || n == 'F' || n == 'I' || n == 'L' + || n == 'T' || n == 'V' || n == 'W' || n == 'Y') || n == '-' || c == '-'; + } + else if (enz == "pepsin") + { + return ((c == 'F' || c == 'L' || c == 'W' || c == 'Y' || n == 'F' + || n == 'L' || n == 'W' || n == 'Y') && n != 'R') || n == '-' || c == '-'; + } + else if (enz == "elastase") + { + return ((n == 'L' || n == 'V' || n == 'A' || n == 'G') && c != 'P') + || n == '-' || c == '-'; + } + else if (enz == "lys-n") + { + return (c == 'K') + || n == '-' || c == '-'; + } + else if (enz == "lys-c") + { + return ((n == 'K') && c != 'P') + || n == '-' || c == '-'; + } + else if (enz == "arg-c") + { + return ((n == 'R') && c != 'P') + || n == '-' || c == '-'; + } + else if (enz == "asp-n") + { + return (c == 'D') + || n == '-' || c == '-'; + } + else if (enz == "glu-c") + { + return ((n == 'E') && (c != 'P')) + || n == '-' || c == '-'; + } + else + { + return true; + } + } + + // Function adapted from Enzyme.h in Percolator converter + Size countEnzymatic_(String peptide, string& enz) + { + Size count = 0; + for (Size ix = 1; ix < peptide.size(); ++ix) + { + if (isEnz_(peptide[ix - 1], peptide[ix], enz)) + { + ++count; + } + } + return count; + } + + //id label scannr calcmass expmass feature1 ... featureN peptide proteinId1 .. proteinIdM + void preparePin_(vector& peptide_ids, StringList& feature_set, std::string& enz, TextFile& txt, int min_charge, int max_charge) + { + std::cout << "number of features = " << feature_set.size() << "\n"; + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + String scan_identifier = getScanIdentifier_(it, peptide_ids.begin()); + Int scan_number = getScanNumber_(scan_identifier); + + double exp_mass = it->getMZ(); + for (vector::const_iterator jt = it->getHits().begin(); jt != it->getHits().end(); ++jt) + { + PeptideHit hit(*jt); // make a copy of the hit to store temporary features + hit.setMetaValue("SpecId", scan_identifier); + hit.setMetaValue("ScanNr", scan_number); + + if (!hit.metaValueExists("target_decoy") || hit.getMetaValue("target_decoy").toString().empty()) continue; + + int label = 1; + if (String(hit.getMetaValue("target_decoy")).hasSubstring("decoy")) + { + label = -1; + } + hit.setMetaValue("Label", label); + + int charge = hit.getCharge(); + String unmodified_sequence = hit.getSequence().toUnmodifiedString(); + + double calc_mass = hit.getSequence().getMonoWeight(Residue::Full, charge)/charge; + hit.setMetaValue("CalcMass", calc_mass); + + + hit.setMetaValue("ExpMass", exp_mass); + hit.setMetaValue("mass", exp_mass); + + double score = hit.getScore(); + hit.setMetaValue("score", score); + + int peptide_length = unmodified_sequence.size(); + hit.setMetaValue("peplen", peptide_length); + + for (int i = min_charge; i <= max_charge; ++i) + { + hit.setMetaValue("charge" + String(i), charge == i); + } + + bool enzN = isEnz_(hit.getPeptideEvidences().front().getAABefore(), unmodified_sequence.prefix(1)[0], enz); + hit.setMetaValue("enzN", enzN); + bool enzC = isEnz_(unmodified_sequence.suffix(1)[0], hit.getPeptideEvidences().front().getAAAfter(), enz); + hit.setMetaValue("enzC", enzC); + int enzInt = countEnzymatic_(unmodified_sequence, enz); + hit.setMetaValue("enzInt", enzInt); + + double delta_mass = exp_mass - calc_mass; + hit.setMetaValue("dm", delta_mass); + + double abs_delta_mass = abs(delta_mass); + hit.setMetaValue("absdm", abs_delta_mass); + + //peptide + String sequence = ""; + // just first peptide evidence + String aa_before(hit.getPeptideEvidences().front().getAABefore()); + String aa_after(hit.getPeptideEvidences().front().getAAAfter()); + aa_before = aa_before=="["?'-':aa_before; + aa_after = aa_after=="]"?'-':aa_after; + sequence += aa_before; + sequence += "." + hit.getSequence().toString() + "."; + sequence += aa_after; + hit.setMetaValue("Peptide", sequence); + + //proteinId1 + StringList proteins; + for (vector::const_iterator kt = hit.getPeptideEvidences().begin(); kt != hit.getPeptideEvidences().end(); ++kt) + { + proteins.push_back(kt->getProteinAccession()); + } + hit.setMetaValue("Proteins", ListUtils::concatenate(proteins, '\t')); + + StringList feats; + for (vector::const_iterator feat = feature_set.begin(); feat != feature_set.end(); ++feat) + { + // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! + if (hit.metaValueExists(*feat)) + { + feats.push_back(hit.getMetaValue(*feat).toString()); + } + } + if (feats.size() == feature_set.size()) + { // only if all feats were present add + txt.addLine(ListUtils::concatenate(feats, '\t')); + } + } + } + } + + void readPoutAsMap_(String pout_file, std::map& pep_map) + { + CsvFile csv_file(pout_file, '\t'); + StringList row; + + for (Size i = 1; i < csv_file.rowCount(); ++i) + { + csv_file.getRow(i, row); + PercolatorResult res(row); + String spec_ref = res.PSMId + res.peptide; + // retain only the best result in the unlikely case that a PSMId+peptide combination occurs multiple times + if (pep_map.find(spec_ref) == pep_map.end()) + { + pep_map.insert( map::value_type ( spec_ref, res ) ); + } + } + } + + void readProteinPoutAsMap_(String pout_protein_file, std::map& protein_map) + { + CsvFile csv_file(pout_protein_file, '\t'); + StringList row; + + for (Size i = 1; i < csv_file.rowCount(); ++i) + { + csv_file.getRow(i, row); + StringList protein_accessions; + row[0].split(",", protein_accessions); + double qvalue = row[2].toDouble(); + double posterior_error_prob = row[3].toDouble(); + for (StringList::iterator it = protein_accessions.begin(); it != protein_accessions.end(); ++it) + { + protein_map.insert( map::value_type ( *it, PercolatorProteinResult(*it, qvalue, posterior_error_prob ) ) ); + } + } + } + + ExitCodes readInputFiles_(StringList in_list, vector& all_peptide_ids, vector& all_protein_ids, bool isDecoy, bool& found_decoys, int& min_charge, int& max_charge) + { + for (StringList::iterator fit = in_list.begin(); fit != in_list.end(); ++fit) + { + String file_idx(distance(in_list.begin(), fit)); + vector peptide_ids; + vector protein_ids; + String in = *fit; + FileHandler fh; + FileTypes::Type in_type = fh.getType(in); + LOG_INFO << "Loading input file: " << in << endl; + if (in_type == FileTypes::IDXML) + { + IdXMLFile().load(in, protein_ids, peptide_ids); + } + else if (in_type == FileTypes::MZIDENTML) + { + LOG_WARN << "Converting from mzid: possible loss of information depending on target format." << endl; + MzIdentMLFile().load(in, protein_ids, peptide_ids); + } + //else catched by TOPPBase:registerInput being mandatory mzid or idxml + + //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process + for (vector::iterator pit = peptide_ids.begin(); pit != peptide_ids.end(); ++pit) + { + if (in_list.size() > 1) + { + String scan_identifier = getScanIdentifier_(pit, peptide_ids.begin()); + scan_identifier = "file=" + file_idx + "," + scan_identifier; + pit->setMetaValue("spectrum_reference", scan_identifier); + } + for (vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) + { + if (!pht->metaValueExists("target_decoy")) + { + if (isDecoy) + { + pht->setMetaValue("target_decoy", "decoy"); + found_decoys = true; + } + else + { + pht->setMetaValue("target_decoy", "target"); + } + } + else if (pht->getMetaValue("target_decoy").toString().hasSubstring("decoy")) + { + found_decoys = true; + } + + if (pht->getCharge() > max_charge) + { + max_charge = pht->getCharge(); + } + if (pht->getCharge() < min_charge) + { + min_charge = pht->getCharge(); + } + + // TODO: set min/max scores? + } + } + + //paranoia check if this comes from the same search engine! (only in the first proteinidentification of the first proteinidentifications vector vector) + if (!all_protein_ids.empty()) { + if (protein_ids.front().getSearchEngine() != all_protein_ids.front().getSearchEngine()) + { + writeLog_("Input files are not all from the same search engine: " + protein_ids.front().getSearchEngine() + " and " + all_protein_ids.front().getSearchEngine() + ". Use TOPP_PSMFeatureExtractor to merge results from different search engines if desired. Aborting!"); + return INCOMPATIBLE_INPUT_DATA; + } + + bool identical_extra_features = true; + ProteinIdentification::SearchParameters all_search_parameters = all_protein_ids.front().getSearchParameters(); + ProteinIdentification::SearchParameters search_parameters = protein_ids.front().getSearchParameters(); + if (all_search_parameters.metaValueExists("extra_features")) + { + StringList all_search_feature_list = ListUtils::create(all_search_parameters.getMetaValue("extra_features").toString()); + set all_search_feature_set(all_search_feature_list.begin(),all_search_feature_list.end()); + if (search_parameters.metaValueExists("extra_features")) + { + StringList search_feature_list = ListUtils::create(search_parameters.getMetaValue("extra_features").toString()); + set search_feature_set(search_feature_list.begin(), search_feature_list.end()); + identical_extra_features = (search_feature_set == all_search_feature_set); + } + else + { + identical_extra_features = false; + } + } + if (!identical_extra_features) + { + writeLog_("Input files do not have the same set of extra features from TOPP_PSMFeatureExtractor. Aborting!"); + return INCOMPATIBLE_INPUT_DATA; + } + + if (protein_ids.front().getScoreType() != all_protein_ids.front().getScoreType() ) + { + LOG_WARN << "Warning: differing ScoreType between input files" << endl; + } + if (search_parameters.digestion_enzyme != all_search_parameters.digestion_enzyme ) + { + LOG_WARN << "Warning: differing DigestionEnzyme between input files" << endl; + } + if (search_parameters.variable_modifications != all_search_parameters.variable_modifications ) + { + LOG_WARN << "Warning: differing VarMods between input files" << endl; + } + if (search_parameters.fixed_modifications != all_search_parameters.fixed_modifications ) + { + LOG_WARN << "Warning: differing FixMods between input files" << endl; + } + if (search_parameters.charges != all_search_parameters.charges ) + { + LOG_WARN << "Warning: differing SearchCharges between input files" << endl; + } + if (search_parameters.fragment_mass_tolerance != all_search_parameters.fragment_mass_tolerance ) + { + LOG_WARN << "Warning: differing FragTol between input files" << endl; + } + if (search_parameters.precursor_mass_tolerance != all_search_parameters.precursor_mass_tolerance ) + { + LOG_WARN << "Warning: differing PrecTol between input files" << endl; + } + } + LOG_INFO << "Merging peptide ids." << endl; + all_peptide_ids.insert(all_peptide_ids.end(), peptide_ids.begin(), peptide_ids.end()); + LOG_INFO << "Merging protein ids." << endl; + PercolatorFeatureSetHelper::mergeMULTISEProteinIds(all_protein_ids, protein_ids); + } + return EXECUTION_OK; + } + + ExitCodes main_(int, const char**) + { + //------------------------------------------------------------- + // general variables and data to perform PercolatorAdapter + //------------------------------------------------------------- + vector all_peptide_ids; + vector all_protein_ids; + + //------------------------------------------------------------- + // parsing parameters + //------------------------------------------------------------- + const StringList in_list = getStringList_("in"); + const StringList in_decoy = getStringList_("in_decoy"); + LOG_DEBUG << "Input file (of target?): " << ListUtils::concatenate(in_list, ",") << " & " << ListUtils::concatenate(in_decoy, ",") << " (decoy)" << endl; + + const String percolator_executable(getStringOption_("percolator_executable")); + writeDebug_(String("Path to the percolator: ") + percolator_executable, 2); + if (percolator_executable.empty()) //TODO? - TOPPBase::findExecutable after registerInputFile_("percolator_executable"... ??? + { + writeLog_("No percolator executable specified. Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + const String mzid_out(getStringOption_("mzid_out")); + const String out(getStringOption_("out")); + if (mzid_out.empty() && out.empty()) + { + writeLog_("Fatal error: no output file given (parameter 'out' or 'mzid_out')"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + bool peptide_level_fdrs = getFlag_("peptide-level-fdrs"); + bool protein_level_fdrs = getFlag_("protein-level-fdrs"); + + //------------------------------------------------------------- + // read input + //------------------------------------------------------------- + + //TODO introduce min/max charge to parameters for now take available range + int max_charge = 0; + int min_charge = 10; + bool is_decoy = false; + bool found_decoys = false; + ExitCodes read_exit = readInputFiles_(in_list, all_peptide_ids, all_protein_ids, is_decoy, found_decoys, min_charge, max_charge); + if (read_exit != EXECUTION_OK) + { + return read_exit; + } + + if (!in_decoy.empty()) + { + is_decoy = true; + read_exit = readInputFiles_(in_decoy, all_peptide_ids, all_protein_ids, is_decoy, found_decoys, min_charge, max_charge); + if (read_exit != EXECUTION_OK) + { + return read_exit; + } + } + LOG_DEBUG << "Using min/max charges of " << min_charge << "/" << max_charge << endl; + + if (!found_decoys) + { + writeLog_("No decoys found, search results discrimination impossible. Aborting!"); + printUsage_(); + return INCOMPATIBLE_INPUT_DATA; + } + + if (all_peptide_ids.empty()) + { + writeLog_("No peptide hits found in input file. Aborting!"); + printUsage_(); + return INPUT_FILE_EMPTY; + } + + if (all_protein_ids.empty()) + { + writeLog_("No protein hits found in input file. Aborting!"); + printUsage_(); + return INPUT_FILE_EMPTY; + } + + //------------------------------------------------------------- + // prepare pin + //------------------------------------------------------------- + + StringList feature_set; + feature_set.push_back("SpecId"); + feature_set.push_back("Label"); + feature_set.push_back("ScanNr"); + feature_set.push_back("ExpMass"); + feature_set.push_back("CalcMass"); + feature_set.push_back("mass"); + feature_set.push_back("peplen"); + for (int i = min_charge; i <= max_charge; ++i) + { + feature_set.push_back("charge" + String(i)); + } + feature_set.push_back("enzN"); + feature_set.push_back("enzC"); + feature_set.push_back("enzInt"); + feature_set.push_back("dm"); + feature_set.push_back("absdm"); + + ProteinIdentification::SearchParameters search_parameters = all_protein_ids.front().getSearchParameters(); + if (search_parameters.metaValueExists("extra_features")) + { + StringList extra_feature_set = ListUtils::create(search_parameters.getMetaValue("extra_features").toString()); + feature_set.insert(feature_set.end(), extra_feature_set.begin(), extra_feature_set.end()); + } + else if (getFlag_("generic-feature-set")) + { + feature_set.push_back("score"); + } + else + { + writeLog_("No search engine specific features found. Generate search engine specific features using PSMFeatureExtractor or set the -generic-features-set flag to override. Aborting!"); + printUsage_(); + return INCOMPATIBLE_INPUT_DATA; + } + + feature_set.push_back("Peptide"); + feature_set.push_back("Proteins"); + + string enz_str = getStringOption_("enzyme"); + + // create temp directory to store percolator in file pin.tab temporarily + String temp_directory_body = QDir::toNativeSeparators((File::getTempDirectory() + "/" + File::getUniqueName() + "/").toQString()); // body for the tmp files + { + QDir d; + d.mkpath(temp_directory_body.toQString()); + } + String txt_designator = File::getUniqueName(); + String pin_file(temp_directory_body + txt_designator + "_pin.tab"); + String pout_target_file(temp_directory_body + txt_designator + "_target_pout_psms.tab"); + String pout_decoy_file(temp_directory_body + txt_designator + "_decoy_pout_psms.tab"); + String pout_target_file_peptides(temp_directory_body + txt_designator + "_target_pout_peptides.tab"); + String pout_decoy_file_peptides(temp_directory_body + txt_designator + "_decoy_pout_peptides.tab"); + String pout_target_file_proteins(temp_directory_body + txt_designator + "_target_pout_proteins.tab"); + String pout_decoy_file_proteins(temp_directory_body + txt_designator + "_decoy_pout_proteins.tab"); + + LOG_DEBUG << "Writing percolator input file." << pin_file << endl; + TextFile txt; + txt.addLine(ListUtils::concatenate(feature_set, '\t')); + preparePin_(all_peptide_ids, feature_set, enz_str, txt, min_charge, max_charge); + txt.store(pin_file); + + QStringList arguments; + // Check all set parameters and get them into arguments StringList + { + if (peptide_level_fdrs) + { + arguments << "-r" << pout_target_file_peptides.toQString(); + arguments << "-B" << pout_decoy_file_peptides.toQString(); + } + else + { + arguments << "-U"; + } + arguments << "-m" << pout_target_file.toQString(); + arguments << "-M" << pout_decoy_file.toQString(); + + if (protein_level_fdrs) + { + arguments << "-l" << pout_target_file_proteins.toQString(); + arguments << "-L" << pout_decoy_file_proteins.toQString(); + + String fasta_file = getStringOption_("fasta"); + if (fasta_file.empty()) fasta_file = "auto"; + arguments << "-f" << fasta_file.toQString(); + + String decoy_pattern = getStringOption_("decoy-pattern"); + if (decoy_pattern != "random") arguments << "-P" << decoy_pattern.toQString(); + } + + double cpos = getDoubleOption_("cpos"); + double cneg = getDoubleOption_("cneg"); + if (cpos != 0.0) arguments << "-p" << String(cpos).toQString(); + if (cneg != 0.0) arguments << "-n" << String(cneg).toQString(); + + double train_FDR = getDoubleOption_("trainFDR"); + double test_FDR = getDoubleOption_("testFDR"); + if (train_FDR != 0.01) arguments << "-F" << String(train_FDR).toQString(); + if (test_FDR != 0.01) arguments << "-t" << String(test_FDR).toQString(); + + Int max_iter = getIntOption_("maxiter"); + if (max_iter != 10) arguments << "-i" << String(max_iter).toQString(); + Int subset_max_train = getIntOption_("subset-max-train"); + if (subset_max_train > 0) arguments << "-N" << String(subset_max_train).toQString(); + if (getFlag_("quick-validation")) arguments << "-x"; + if (getFlag_("post-processing-tdc")) arguments << "-Y"; + + String weights_file = getStringOption_("weights"); + String init_weights_file = getStringOption_("init-weights"); + String default_search_direction = getStringOption_("default-direction"); + if (!weights_file.empty()) arguments << "-w" << weights_file.toQString(); + if (!init_weights_file.empty()) arguments << "-W" << init_weights_file.toQString(); + if (!default_search_direction.empty()) arguments << "-V" << default_search_direction.toQString(); + + Int verbose_level = getIntOption_("verbose"); + if (verbose_level != 2) arguments << "-v" << String(verbose_level).toQString(); + if (getFlag_("unitnorm")) arguments << "-u"; + if (getFlag_("test-each-iteration")) arguments << "-R"; + if (getFlag_("override")) arguments << "-O"; + + Int seed = getIntOption_("seed"); + if (seed != 1) arguments << "-S" << String(seed).toQString(); + if (getFlag_("klammer")) arguments << "-K"; + + Int description_of_correct = getIntOption_("doc"); + if (description_of_correct != 0) arguments << "-D" << String(description_of_correct).toQString(); + + arguments << pin_file.toQString(); + } + writeLog_("Prepared percolator input."); + + //------------------------------------------------------------- + // run percolator + //------------------------------------------------------------- + // Percolator execution with the executable ant the arguments StringList + int status = QProcess::execute(percolator_executable.toQString(), arguments); // does automatic escaping etc... + if (status != 0) + { + writeLog_("Percolator problem. Aborting! Calling command was: '" + percolator_executable + " \"" + arguments.join("-").toStdString() + "\"."); + // clean temporary files + if (this->debug_level_ < 2) + { + File::removeDirRecursively(temp_directory_body); + LOG_WARN << "Set debug level to >=2 to keep the temporary files at '" << temp_directory_body << "'" << endl; + } + else + { + LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <2 to remove them." << endl; + } + return EXTERNAL_PROGRAM_ERROR; + } + writeLog_("Executed percolator!"); + + + //------------------------------------------------------------- + // reintegrate pout results + //------------------------------------------------------------- + // when percolator finished calculation, it stores the results -r option (with or without -U) or -m (which seems to be not working) + // WARNING: The -r option cannot be used in conjunction with -U: no peptide level statistics are calculated, redirecting PSM level statistics to provided file instead. + map pep_map; + if (peptide_level_fdrs) + { + readPoutAsMap_(pout_target_file_peptides, pep_map); + readPoutAsMap_(pout_decoy_file_peptides, pep_map); + } + else + { + readPoutAsMap_(pout_target_file, pep_map); + readPoutAsMap_(pout_decoy_file, pep_map); + } + + map protein_map; + if (protein_level_fdrs) + { + readProteinPoutAsMap_(pout_target_file_proteins, protein_map); + readProteinPoutAsMap_(pout_decoy_file_proteins, protein_map); + } + + // As the percolator output file is not needed anymore, the temporary directory is going to be deleted + if (this->debug_level_ < 5) + { + File::removeDirRecursively(temp_directory_body); + LOG_WARN << "Removing temporary directory for Percolator in/output. Set debug level to >=5 to keep the temporary files." << endl; + } + else + { + LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <5 to remove them." << endl; + } + + // Add the percolator results to the peptide vector of the original input file + //size_t c_debug = 0; + size_t cnt = 0; + String run_identifier = all_protein_ids.front().getIdentifier(); + for (vector::iterator it = all_peptide_ids.begin(); it != all_peptide_ids.end(); ++it) + { + it->setIdentifier(run_identifier); + it->setScoreType("q-value"); + it->setHigherScoreBetter(false); + + String scan_identifier = getScanIdentifier_(it, all_peptide_ids.begin()); + + //check each PeptideHit for compliance with one of the PercolatorResults (by sequence) + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + String peptide_sequence = hit->getSequence().toString(); + String psm_identifier = scan_identifier + peptide_sequence; + + map::iterator pr = pep_map.find(psm_identifier); + if (pr != pep_map.end()) + { + hit->setMetaValue("MS:1001492", pr->second.score); // svm score + hit->setMetaValue("MS:1001491", pr->second.qvalue); // percolator q value + hit->setMetaValue("MS:1001493", pr->second.posterior_error_prob); // percolator pep + hit->setScore(pr->second.qvalue); + ++cnt; + } + else + { + hit->setScore(1.0); // set q-value to 1.0 if hit not found in results + } + } + } + //LOG_INFO << "No suitable PeptideIdentification for " << c_debug << " out of " << all_peptide_ids.size() << endl; + LOG_INFO << "Suitable PeptideHits for " << cnt << " found." << endl; + + // TODO: There should only be 1 ProteinIdentification element in this vector, no need for a for loop + for (vector::iterator it = all_protein_ids.begin(); it != all_protein_ids.end(); ++it) + { + if (protein_level_fdrs) + { + //check each ProteinHit for compliance with one of the PercolatorProteinResults (by accession) + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + String protein_accession = hit->getAccession(); + map::iterator pr = protein_map.find(protein_accession); + if (pr != protein_map.end()) + { + hit->setMetaValue("MS:1001491", pr->second.qvalue); // percolator q value + hit->setMetaValue("MS:1001493", pr->second.posterior_error_prob); // percolator pep + hit->setScore(pr->second.qvalue); + } + else + { + hit->setScore(1.0); // set q-value to 1.0 if hit not found in results + } + } + it->setSearchEngine("Percolator"); + it->setScoreType("q-value"); + it->setHigherScoreBetter(false); + it->sort(); + } + + //TODO add software percolator and PercolatorAdapter + it->setMetaValue("percolator", "PercolatorAdapter"); + ProteinIdentification::SearchParameters search_parameters = it->getSearchParameters(); + + search_parameters.setMetaValue("Percolator:peptide-level-fdrs", peptide_level_fdrs); + search_parameters.setMetaValue("Percolator:protein-level-fdrs", protein_level_fdrs); + search_parameters.setMetaValue("Percolator:generic-feature-set", getFlag_("generic-feature-set")); + search_parameters.setMetaValue("Percolator:testFDR", getDoubleOption_("testFDR")); + search_parameters.setMetaValue("Percolator:trainFDR", getDoubleOption_("trainFDR")); + search_parameters.setMetaValue("Percolator:maxiter", getIntOption_("maxiter")); + search_parameters.setMetaValue("Percolator:subset-max-train", getIntOption_("subset-max-train")); + search_parameters.setMetaValue("Percolator:quick-validation", getFlag_("quick-validation")); + search_parameters.setMetaValue("Percolator:weights", getStringOption_("weights")); + search_parameters.setMetaValue("Percolator:init-weights", getStringOption_("init-weights")); + search_parameters.setMetaValue("Percolator:default-direction", getStringOption_("default-direction")); + search_parameters.setMetaValue("Percolator:cpos", getDoubleOption_("cpos")); + search_parameters.setMetaValue("Percolator:cneg", getDoubleOption_("cneg")); + search_parameters.setMetaValue("Percolator:unitnorm", getFlag_("unitnorm")); + search_parameters.setMetaValue("Percolator:override", getFlag_("override")); + search_parameters.setMetaValue("Percolator:seed", getIntOption_("seed")); + search_parameters.setMetaValue("Percolator:doc", getIntOption_("doc")); + search_parameters.setMetaValue("Percolator:klammer", getFlag_("klammer")); + search_parameters.setMetaValue("Percolator:fasta", getStringOption_("fasta")); + search_parameters.setMetaValue("Percolator:decoy-pattern", getStringOption_("decoy-pattern")); + search_parameters.setMetaValue("Percolator:post-processing-tdc", getFlag_("post-processing-tdc")); + + it->setSearchParameters(search_parameters); + } + + // Storing the PeptideHits with calculated q-value, pep and svm score + if (!mzid_out.empty()) + { + MzIdentMLFile().store(mzid_out.toQString().toStdString(), all_protein_ids, all_peptide_ids); + } + if (!out.empty()) + { + IdXMLFile().store(out.toQString().toStdString(), all_protein_ids, all_peptide_ids); + } + + writeLog_("PercolatorAdapter finished successfully!"); + return EXECUTION_OK; + } + +}; + + +int main(int argc, const char** argv) +{ + PercolatorAdapter tool; + + return tool.main(argc, argv); +} + +/// @endcond diff --git a/src/topp/executables.cmake b/src/topp/executables.cmake index 46bd1a3ef9b..c16006716d8 100644 --- a/src/topp/executables.cmake +++ b/src/topp/executables.cmake @@ -72,6 +72,7 @@ PeakPickerHiRes PeakPickerWavelet PepNovoAdapter PeptideIndexer +PercolatorAdapter PhosphoScoring PrecursorIonSelector PrecursorMassCorrector diff --git a/src/utils/COMETAdapter.cpp b/src/utils/COMETAdapter.cpp new file mode 100755 index 00000000000..193c930b298 --- /dev/null +++ b/src/utils/COMETAdapter.cpp @@ -0,0 +1,551 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2016. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Leon Bichmann, Timo Sachsenberg, Andreas Bertsch, Chris Bielow $ +// -------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include // std::cout, std::ostream, std::ios +#include + +using namespace OpenMS; +using namespace std; + +//------------------------------------------------------------- +//Doxygen docu +//------------------------------------------------------------- + +/** + @page TOPP_COMETAdapter COMETAdapter + + @brief Identifies peptides in MS/MS spectra via COMET. + +
+ + + + + + + + + + +
pot. predecessor tools \f$ \longrightarrow \f$ COMETAdapter \f$ \longrightarrow \f$ pot. successor tools
any signal-/preprocessing tool @n (in mzML format) @ref TOPP_IDFilter or @n any protein/peptide processing tool
+
+ + @em COMET must be installed before this wrapper can be used. This wrapper + has been successfully tested with several versions of COMET. + + This adapter supports relative database filenames, which (when not found in the current working directory) is looked up in + the directories specified by 'OpenMS.ini:id_db_dir' (see @subpage TOPP_advanced). + + COMET settings not exposed by this adapter can be directly adjusted using an XML configuration file. + By default, all (!) parameters available explicitly via this wrapper take precedence over the XML configuration file. + The parameter "default_config_file" can be used to specify such a custom configuration. + An example of a configuration file (named "default_input.xml") is contained in the "bin" folder of the + @em COMET installation and the OpenMS installation under OpenMS/share/CHEMISTRY/COMET_default_input.xml. + The latter is loaded by default. + If you want to use the XML configuration file and @em ignore most of the parameters set via this adapter, use the '-ignore_adapter_param' + flag. Then, the config given in '-default_config_file' is used exclusively and only '-in', '-out', '-database' and '-COMET_executable' are + taken from this adapter. + + @note Currently mzIdentML (mzid) is not directly supported as an input/output format of this tool. Convert mzid files to/from idXML using @ref TOPP_IDFileConverter if necessary. + + The command line parameters of this tool are: + @verbinclude TOPP_COMETAdapter.cli + INI file documentation of this tool: + @htmlinclude TOPP_COMETAdapter.html +*/ + +// We do not want this class to show up in the docu: +/// @cond TOPPCLASSES + + +class TOPPCOMETAdapter : + public TOPPBase +{ +public: + TOPPCOMETAdapter() : + TOPPBase("COMETAdapter", "Annotates MS/MS spectra using COMET.") + { + } + +protected: + void registerOptionsAndFlags_() + { + + registerInputFile_("in", "", "", "Input file"); + setValidFormats_("in", ListUtils::create("mzML")); + registerOutputFile_("out", "", "", "Output file"); + setValidFormats_("out", ListUtils::create("idXML")); + registerOutputFile_("pin_out", "", "", "Output file - for percolator input"); + setValidFormats_("pin_out", ListUtils::create("pin"), false); + registerInputFile_("database", "", "", "FASTA file or pro file. Non-existing relative file-names are looked up via'OpenMS.ini:id_db_dir'", true, false, ListUtils::create("skipexists")); + setValidFormats_("database", ListUtils::create("FASTA")); + registerInputFile_("comet_executable", "", + // choose the default value according to the platform where it will be executed + // COMET compiles as tandem on OSX and tandem.exe on any other platform + "comet.exe", + "Comet executable of the installation e.g. 'comet.exe'", true, false, ListUtils::create("skipexists")); + + addEmptyLine_(); + // + // Optional parameters (if '-ignore_adapter_param' is set) + // + + registerDoubleOption_("peptide_mass_tolerance", "", 10.0, "peptide_mass_tolerance", false); + registerDoubleOption_("fragment_mass_tolerance", "", 0.3, "Fragment mass error", false); + + registerIntOption_("peptide_mass_units", "", 2, "0=amu, 1=mmu, 2=ppm", false); + registerStringOption_("precursor_error_units", "", "ppm", "Parent monoisotopic mass error units", false); + registerStringOption_("fragment_error_units", "", "Da", "Fragment monoisotopic mass error units", false); + vector valid_strings = ListUtils::create("ppm,Da"); + + registerIntOption_("mass_type_parent", "", 1, "0=average masses, 1=monoisotopic masses", false); + registerIntOption_("mass_type_fragment", "", 1, "0=average masses, 1=monoisotopic masses", false); + registerIntOption_("precursor_tolerance_type", "", 0, "0=average masses, 1=monoisotopic masses", false); + registerIntOption_("isotope_error", "", 0, "0=off, 1=on -1/0/1/2/3 (standard C13 error), 2= -8/-4/0/4/8 (for +4/+8 labeling)", false); + + registerIntOption_("num_enzyme_termini", "", 2, "1 (semi-digested), 2 (fully digested, default), 8 C-term unspecific , 9 N-term unspecific", false); + registerIntOption_("allowed_missed_cleavages", "", 1, "Number of possible cleavage sites missed by the enzyme, maximum value is 5; for enzyme search", false); + + registerStringOption_("decoy_prefix", "", "rev_", "decoy entries are denoted by this string which is pre-pended to each protein accession", false); + + vector all_mods; + ModificationsDB::getInstance()->getAllSearchModifications(all_mods); + registerStringList_("variable_modifications", "", ListUtils::create(""), "Variable modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'", false); + setValidStrings_("variable_modifications", all_mods); + + addEmptyLine_(); + + vector all_enzymes; + EnzymesDB::getInstance()->getAllXTandemNames(all_enzymes); + registerStringOption_("cleavage_site", "", "Trypsin", "The enzyme used for peptide digestion.", false); + setValidStrings_("cleavage_site", all_enzymes); + } + + vector getModifications_(StringList modNames) + { + vector modifications; + + // iterate over modification names and add to vector + for (StringList::iterator mod_it = modNames.begin(); mod_it != modNames.end(); ++mod_it) + { + String modification(*mod_it); + modifications.push_back(ModificationsDB::getInstance()->getModification(modification)); + } + + return modifications; + } + + void removeTempDir_(const String& tmp_dir) + { + if (tmp_dir.empty()) return; // no temp. dir. created + + if (debug_level_ >= 2) + { + writeDebug_("Keeping temporary files in directory '" + tmp_dir + "'. Set debug level to 1 or lower to remove them.", 2); + } + else + { + if (debug_level_ == 1) writeDebug_("Deleting temporary directory '" + tmp_dir + "'. Set debug level to 2 or higher to keep it.", 1); + File::removeDirRecursively(tmp_dir); + } + } + + void createParamFile_(ostream& os) + { + os << "# comet_version 2016.01 rev. 2\n"; //required as first line in the param file + os << "# Comet MS/MS search engine parameters file.\n"; + os << "# Everything following the '#' symbol is treated as a comment.\n"; + os << "database_name = " << getStringOption_("database") << "\n"; + os << "decoy_search = " << 0 << "\n"; // 0=no (default), 1=concatenated search, 2=separate search + os << "num_threads = " << getIntOption_("threads") << "\n"; // 0=poll CPU to set num threads; else specify num threads directly (max 64) + + // masses + os << "peptide_mass_tolerance = " << getDoubleOption_("peptide_mass_tolerance") << "\n"; + os << "peptide_mass_units = " << getIntOption_("peptide_mass_units") << "\n"; // 0=amu, 1=mmu, 2=ppm + os << "mass_type_parent = " << getIntOption_("mass_type_parent") << "\n"; // 0=average masses, 1=monoisotopic masses + os << "mass_type_fragment = " << getIntOption_("mass_type_fragment") << "\n"; // 0=average masses, 1=monoisotopic masses + os << "precursor_tolerance_type = " << getIntOption_("precursor_tolerance_type") << "\n"; // 0=MH+ (default), 1=precursor m/z; only valid for amu/mmu tolerances + os << "isotope_error = " << getIntOption_("isotope_error") << "\n"; // 0=off, 1=on -1/0/1/2/3 (standard C13 error), 2= -8/-4/0/4/8 (for +4/+8 labeling) + + // search enzyme + + // TODO: complete + map map_oms2comet; + map_oms2comet["Trypsin"] = 1; + map_oms2comet["Arg-C"] = 5; + map_oms2comet["Asp-N"] = 6; + map_oms2comet["Chymotrypsin"] = 10; + map_oms2comet["CNBr"] = 7; + map_oms2comet["Lys-C"] = 3; + map_oms2comet["PepsinA"] = 9; + map_oms2comet["Trypsin/P"] = 2; + map_oms2comet["no cleavage"] = 0; + + String enzyme_name = getStringOption_("cleavage_site"); + Size enzyme_number = 1; + if (map_oms2comet.find(enzyme_name) != map_oms2comet.end()) + { + enzyme_number = map_oms2comet.at(enzyme_name); + } + else + { + throw OpenMS::Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Error: Enzyme not supported. " + enzyme_name); + } + + os << "search_enzyme_number = " << enzyme_number << "\n"; // choose from list at end of this params file + os << "num_enzyme_termini = " << getIntOption_("num_enzyme_termini") << "\n"; // 1 (semi-digested), 2 (fully digested, default), 8 C-term unspecific , 9 N-term unspecific + os << "allowed_missed_cleavage = " << getIntOption_("allowed_missed_cleavages") << "\n"; // maximum value is 5; for enzyme search + + // Up to 9 variable modifications are supported + // format: <0=variable/else binary> + // e.g. 79.966331 STY 0 3 -1 0 0 + vector variable_modifications_names = getStringList_("variable_modifications"); + vector variable_modifications = getModifications_(variable_modifications_names); + if (variable_modifications.size() > 9) + { + throw OpenMS::Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Error: Comet only supports 9 variable modifications. " + String(variable_modifications.size()) + " provided."); + } + + Size var_mod_index = 1; + + // write out user specified modifications + for (; var_mod_index <= variable_modifications.size(); ++var_mod_index) + { + const ResidueModification mod = variable_modifications[var_mod_index]; + double mass = mod.getDiffMonoMass(); + String residues = mod.getOrigin(); // TODO: check if origin contains C-term string or similar. Should not be passed to commet as residue string + String variable = "0"; + String max_mods_per_peptide = "3"; + String term_distance = "-1"; + String nc_term = "0"; + + if (mod.getTermSpecificity() == ResidueModification::C_TERM) + { + term_distance = 0; + nc_term = "3"; + } + else if (mod.getTermSpecificity() == ResidueModification::N_TERM) + { + term_distance = 0; + nc_term = "2"; + } + else if (mod.getTermSpecificity() == ResidueModification::PROTEIN_N_TERM) + { + term_distance = 0; + nc_term = "0"; + } + else if (mod.getTermSpecificity() == ResidueModification::PROTEIN_C_TERM) + { + term_distance = 0; + nc_term = "1"; + } + String required = "0"; + + os << "variable_mod0" << var_mod_index << " = " << mass << " " << residues << " " << variable << " " << max_mods_per_peptide << " " << term_distance << " " << nc_term << " " << required << "\n"; + } + + // fill remaining modification slots (if any) in Comet with "no modification" + for (; var_mod_index <= 9; ++var_mod_index) + { + os << "variable_mod0" << var_mod_index << " = " << "0.0 X 0 3 -1 0 0" << "\n"; + } + + os << "max_variable_mods_in_peptide = " << 5 << "\n"; + os << "require_variable_mod = " << 0 << "\n"; + + // fragment ions + // ion trap ms/ms: 1.0005 tolerance, 0.4 offset (mono masses), theoretical_fragment_ions = 1 + // high res ms/ms: 0.02 tolerance, 0.0 offset (mono masses), theoretical_fragment_ions = 0 + os << "fragment_bin_tol = " << 1.0005 << "\n"; // binning to use on fragment ions + os << "fragment_bin_offset = " << 0.4 << "\n"; // offset position to start the binning (0.0 to 1.0) + os << "theoretical_fragment_ions = " << 1 << "\n"; // 0=use flanking peaks, 1=M peak only + os << "use_A_ions = " << 0 << "\n"; + os << "use_B_ions = " << 1 << "\n"; + os << "use_C_ions = " << 0 << "\n"; + os << "use_X_ions = " << 0 << "\n"; + os << "use_Y_ions = " << 1 << "\n"; + os << "use_Z_ions = " << 0 << "\n"; + os << "use_NL_ions = " << 0 << "\n"; // 0=no, 1=yes to consider NH3/H2O neutral loss peaks + + // output + os << "output_sqtstream = " << 0 << "\n"; // 0=no, 1=yes write sqt to standard output + os << "output_sqtfile = " << 0 << "\n"; // 0=no, 1=yes write sqt file + os << "output_txtfile = " << 0 << "\n"; // 0=no, 1=yes write tab-delimited txt file + os << "output_pepxmlfile = " << 1 << "\n"; // 0=no, 1=yes write pep.xml file + + os << "output_percolatorfile = " << !getStringOption_("pin_out").empty() << "\n"; // 0=no, 1=yes write Percolator tab-delimited input file + os << "output_outfiles = " << 0 << "\n"; // 0=no, 1=yes write .out files + os << "print_expect_score = " << 1 << "\n"; // 0=no, 1=yes to replace Sp with expect in out & sqt + os << "num_output_lines = " << 5 << "\n"; // num peptide results to show + os << "show_fragment_ions = " << 0 << "\n"; // 0=no, 1=yes for out files only + os << "sample_enzyme_number = " << 0 << "\n"; // Sample enzyme which is possibly different than the one applied to the search. + + // mzXML parameters + os << "scan_range = " << "0 0" << "\n"; // start and scan scan range to search; 0 as 1st entry ignores parameter + os << "precursor_charge = " << "0 0" << "\n"; // precursor charge range to analyze; does not override any existing charge; 0 as 1st entry ignores parameter + os << "override_charge = " << 0 << "\n"; // 0=no, 1=override precursor charge states, 2=ignore precursor charges outside precursor_charge range, 3=see online + os << "ms_level = " << 2 << "\n"; // MS level to analyze, valid are levels 2 (default) or 3 + os << "activation_method = " << "ALL" << "\n"; // activation method; used if activation method set; allowed ALL, CID, ECD, ETD, PQD, HCD, IRMPD + + // misc parameters + os << "digest_mass_range = " << "600.0 5000.0" << "\n"; // MH+ peptide mass range to analyze + os << "num_results = " << 100 << "\n"; // number of search hits to store internally + os << "skip_researching = " << 1 << "\n"; // for '.out' file output only, 0=search everything again (default), 1=dont search if .out exists + os << "max_fragment_charge = " << 3 << "\n"; // set maximum fragment charge state to analyze (allowed max 5) + os << "max_precursor_charge = " << 4 << "\n"; // set maximum precursor charge state to analyze (allowed max 9) + os << "nucleotide_reading_frame = " << 0 << "\n"; // 0=proteinDB, 1-6, 7=forward three, 8=reverse three, 9=all six + os << "clip_nterm_methionine = " << 0 << "\n"; // 0=leave sequences as-is; 1=also consider sequence w/o N-term methionine + os << "spectrum_batch_size = " << 0 << "\n"; // max. // of spectra to search at a time; 0 to search the entire scan range in one loop + os << "decoy_prefix = " << getStringOption_("decoy_prefix") << "\n"; // decoy entries are denoted by this string which is pre-pended to each protein accession + os << "output_suffix = " << "" << "\n"; // add a suffix to output base names i.e. suffix "-C" generates base-C.pep.xml from base.mzXML input + os << "mass_offsets = " << "" << "\n"; // one or more mass offsets to search (values substracted from deconvoluted precursor mass) + + // spectral processing + os << "minimum_peaks = " << 10 << "\n"; // required minimum number of peaks in spectrum to search (default 10) + os << "minimum_intensity = " << 0 << "\n"; // minimum intensity value to read in + os << "remove_precursor_peak = " << 0 << "\n"; // 0=no, 1=yes, 2=all charge reduced precursor peaks (for ETD) + os << "remove_precursor_tolerance = " << 1.5 << "\n"; // +- Da tolerance for precursor removal + os << "clear_mz_range = " << "0.0 0.0" << "\n"; // for iTRAQ/TMT type data; will clear out all peaks in the specified m/z range + + // additional modifications + os << "add_Cterm_peptide = " << 0.0 << "\n"; + os << "add_Nterm_peptide = " << 0.0 << "\n"; + os << "add_Cterm_protein = " << 0.0 << "\n"; + os << "add_Nterm_protein = " << 0.0 << "\n"; + os << "add_G_glycine = " << 0.0000 << "\n"; // added to G - avg. 57.0513, mono. 57.02146 + os << "add_A_alanine = " << 0.0000 << "\n"; // added to A - avg. 71.0779, mono. 71.03711 + os << "add_S_serine = " << 0.0000 << "\n"; // added to S - avg. 87.0773, mono. 87.03203 + os << "add_P_proline = " << 0.0000 << "\n"; // added to P - avg. 97.1152, mono. 97.05276 + os << "add_V_valine = " << 0.0000 << "\n"; // added to V - avg. 99.1311, mono. 99.06841 + os << "add_T_threonine = " << 0.0000 << "\n"; // added to T - avg. 101.1038, mono. 101.04768 + os << "add_C_cysteine = " << 0.0000 << "\n"; // added to C - avg. 103.1429, mono. 103.00918 + os << "add_L_leucine = " << 0.0000 << "\n"; // added to L - avg. 113.1576, mono. 113.08406 + os << "add_I_isoleucine = " << 0.0000 << "\n"; // added to I - avg. 113.1576, mono. 113.08406 + os << "add_N_asparagine = " << 0.0000 << "\n"; // added to N - avg. 114.1026, mono. 114.04293 + os << "add_D_aspartic_acid = " << 0.0000 << "\n"; // added to D - avg. 115.0874, mono. 115.02694 + os << "add_Q_glutamine = " << 0.0000 << "\n"; // added to Q - avg. 128.1292, mono. 128.05858 + os << "add_K_lysine = " << 0.0000 << "\n"; // added to K - avg. 128.1723, mono. 128.09496 + os << "add_E_glutamic_acid =" << 0.0000 << "\n"; // added to E - avg. 129.1140, mono. 129.04259 + os << "add_M_methionine =" << 0.0000 << "\n"; // added to M - avg. 131.1961, mono. 131.04048 + os << "add_O_ornithine = " << 0.0000 << "\n"; // added to O - avg. 132.1610, mono 132.08988 + os << "add_H_histidine = " << 0.0000 << "\n"; // added to H - avg. 137.1393, mono. 137.05891 + os << "add_F_phenylalanine = " << 0.0000 << "\n"; // added to F - avg. 147.1739, mono. 147.06841 + os << "add_U_selenocysteine = " << 0.0000 << "\n"; // added to U - avg. 150.3079, mono. 150.95363 + os << "add_R_arginine = " << 0.0000 << "\n"; // added to R - avg. 156.1857, mono. 156.10111 + os << "add_Y_tyrosine = " << 0.0000 << "\n"; // added to Y - avg. 163.0633, mono. 163.06333 + os << "add_W_tryptophan = " << 0.0000 << "\n"; // added to W - avg. 186.0793, mono. 186.07931 + os << "add_B_user_amino_acid = " << 0.0000 << "\n"; // added to B - avg. 0.0000, mono. 0.00000 + os << "add_J_user_amino_acid = " << 0.0000 << "\n"; // added to J - avg. 0.0000, mono. 0.00000 + os << "add_X_user_amino_acid = " << 0.0000 << "\n"; // added to X - avg. 0.0000, mono. 0.00000 + os << "add_Z_user_amino_acid = " << 0.0000 << "\n"; // added to Z - avg. 0.0000, mono. 0.00000 + + // COMET_ENZYME_INFO _must_ be at the end of this parameters file + os << "[COMET_ENZYME_INFO]" << "\n"; + os << "0. No_enzyme 0 - -" << "\n"; + os << "1. Trypsin 1 KR P" << "\n"; + os << "2. Trypsin/P 1 KR -" << "\n"; + os << "3. Lys_C 1 K P" << "\n"; + os << "4. Lys_N 0 K -" << "\n"; + os << "5. Arg_C 1 R P" << "\n"; + os << "6. Asp_N 0 D -" << "\n"; + os << "7. CNBr 1 M -" << "\n"; + os << "8. Glu_C 1 DE P" << "\n"; + os << "9. PepsinA 1 FL P" << "\n"; + os << "10. Chymotrypsin 1 FWYL P" << "\n"; + } + + ExitCodes main_(int, const char**) + { + //------------------------------------------------------------- + // parsing parameters + //------------------------------------------------------------- + + String inputfile_name = getStringOption_("in"); + writeDebug_(String("Input file: ") + inputfile_name, 1); + if (inputfile_name == "") + { + writeLog_("No input file specified. Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + String outputfile_name = getStringOption_("out"); + writeDebug_(String("Output file: ") + outputfile_name, 1); + if (outputfile_name == "") + { + writeLog_("No output file specified. Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + + //------------------------------------------------------------- + // reading input + //------------------------------------------------------------- + + String db_name(getStringOption_("database")); + if (!File::readable(db_name)) + { + String full_db_name; + try + { + full_db_name = File::findDatabase(db_name); + } + catch (...) + { + printUsage_(); + return ILLEGAL_PARAMETERS; + } + db_name = full_db_name; + } + + //tmp_dir + const String tmp_dir = QDir::toNativeSeparators((File::getTempDirectory() + "/").toQString()); + writeDebug_("Creating temporary directory '" + tmp_dir + "'", 1); + QDir d; + d.mkpath(tmp_dir.toQString()); + + String tmp_file = tmp_dir + "param.txt"; + String tmp_pepxml = File::removeExtension(inputfile_name) + ".pep.xml"; + String tmp_pin = File::removeExtension(inputfile_name) + ".pin"; + + ofstream os(tmp_file); + createParamFile_(os); + os.close(); + + PeakMap exp; + MzMLFile mzml_file; + mzml_file.getOptions().addMSLevel(2); // only load msLevel 2 + mzml_file.setLogType(log_type_); + mzml_file.load(inputfile_name, exp); + + if (exp.getSpectra().empty()) + { + throw OpenMS::Exception::FileEmpty(__FILE__, __LINE__, __FUNCTION__, "Error: No MS2 spectra in input file."); + } + + // determine type of spectral data (profile or centroided) + SpectrumSettings::SpectrumType spectrum_type = exp[0].getType(); + + if (spectrum_type == SpectrumSettings::RAWDATA) + { + if (!getFlag_("force")) + { + throw OpenMS::Exception::IllegalArgument(__FILE__, __LINE__, __FUNCTION__, "Error: Profile data provided but centroided MS2 spectra expected. To enforce processing of the data set the -force flag."); + } + } + + + //------------------------------------------------------------- + // calculations + //------------------------------------------------------------- + + String param = "-P" + tmp_file; + QStringList process_params; + process_params << param.toQString() << inputfile_name.toQString(); + //qDebug() << process_params; + + String comet_executable = getStringOption_("comet_executable"); + //int status = QProcess::execute(comet_executable.toQString(), QStringList(inputfile_name.toQString())); // does automatic escaping etc... + int status = QProcess::execute(comet_executable.toQString(),process_params); // does automatic escaping etc... + if (status != 0) + { + writeLog_("Comet problem. Aborting! Calling command was: '" + comet_executable + " \"" + inputfile_name + "\"'.\nDoes the Comet executable exist?"); + // clean temporary files + if (this->debug_level_ < 2) + { + removeTempDir_(tmp_dir); + LOG_WARN << "Set debug level to >=2 to keep the temporary files at '" << tmp_dir << "'" << std::endl; + } + else + { + LOG_WARN << "Keeping the temporary files at '" << tmp_dir << "'. Set debug level to <2 to remove them." << std::endl; + } + //return EXTERNAL_PROGRAM_ERROR; + } + + // read the pep.xml output of COMET and write it to idXML + + vector peptide_identifications; + vector protein_identifications; + + writeDebug_("write PepXMLFile", 1); + PepXMLFile().load(tmp_pepxml, protein_identifications, peptide_identifications, inputfile_name); + + if (this->debug_level_ == 0) + { + File::remove(tmp_pepxml); + LOG_WARN << "Set debug level to >0 to keep the temporary pep.xml and pin files at '" << tmp_pepxml << "'" << std::endl; + } + else + { + LOG_WARN << "Keeping the temporary files at '" << tmp_pepxml << "'. Set debug level to 0 to remove them." << std::endl; + } + + //------------------------------------------------------------- + // writing output + //------------------------------------------------------------- + + IdXMLFile().store(outputfile_name, protein_identifications, peptide_identifications); + + } + +}; + + +int main(int argc, const char** argv) +{ + TOPPCOMETAdapter tool; + + return tool.main(argc, argv); +} + +/// @endcond diff --git a/src/utils/PSMFeatureExtractor.cpp b/src/utils/PSMFeatureExtractor.cpp new file mode 100644 index 00000000000..a62c8379d2f --- /dev/null +++ b/src/utils/PSMFeatureExtractor.cpp @@ -0,0 +1,304 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2015. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Mathias Walzer $ +// $Authors: Andreas Simon, Mathias Walzer, Matthew The $ +// -------------------------------------------------------------------------- +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace OpenMS; +using namespace std; + +//------------------------------------------------------------- +//Doxygen docu +//------------------------------------------------------------- + +/** + @page UTILS_PSMFeatureExtractor PSMFeatureExtractor + + @brief PSMFeatureExtractor computes extra features for each input PSM + + @experimental Parts of this tool are still work in progress and usage and input requirements or output might change. (multiple_search_engine, Mascot support) + +
+ + + + + + + + + + +
pot. predecessor tools \f$ \longrightarrow \f$ PSMFeatureExtractor \f$ \longrightarrow \f$ pot. successor tools
@ref TOPP_PeptideIndexer @ref TOPP_PercolatorAdapter
+
+ +

+PSMFeatureExtractor is search engine sensitive, i.e. it's extra features +vary, depending on the search engine. Thus, please make sure the input is +compliant with TOPP SearchengineAdapter output. Also, PeptideIndexer compliant +target/decoy annotation is mandatory. +Currently supported search engines are Comet, X!Tandem, MSGF+. +Mascot support is available but in beta development. +

+ + @note if you have extra features you want to pass to percolator, use the extra + flag and list the MetaData entries containing the extra features. + + The command line parameters of this tool are: + @verbinclude UTILS_PSMFeatureExtractor.cli + INI file documentation of this tool: + @htmlinclude UTILS_PSMFeatureExtractor.html +*/ + +// We do not want this class to show up in the docu: +/// @cond TOPPCLASSES + +class PSMFeatureExtractor : + public TOPPBase +{ +public: + PSMFeatureExtractor() : + TOPPBase("PSMFeatureExtractor", "Computes extra features for each input PSM.", false) + { + } + +protected: + void registerOptionsAndFlags_() + { + registerInputFileList_("in", "", StringList(), "Input file(s)", true); + setValidFormats_("in", ListUtils::create("mzid,idXML")); + registerOutputFile_("out", "", "", "Output file in idXML format", false); + registerOutputFile_("mzid_out", "", "", "Output file in mzid format", false); + registerStringList_("extra", "", vector(), "List of the MetaData parameters to be included in a feature set for precolator.", false, false); + // setValidStrings_("extra", ?); + // TODO: add this MHC feature back in with TopPerc::hasMHCEnd_() + //registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); + registerFlag_("multiple_search_engines", "Combine PSMs from different search engines by merging on scan level."); + registerFlag_("skip_db_check", "Manual override to skip the check if same settings for multiple search engines were applied. Only valid together with -multiple_search_engines flag.", true); + registerFlag_("concat", "Naive merging of PSMs from different search engines: concatenate multiple search results instead of merging on scan level. Only valid together with -multiple_search_engines flag.", true); + registerFlag_("impute", "Will instead of discarding all PSM not unanimously detected by all SE, impute missing values by their respective scores min/max observed. Only valid together with -multiple_search_engines flag.", true); + registerFlag_("limit_imputation", "Will impute missing scores with the worst numerical limit (instead of min/max observed) of the respective score. Only valid together with -multiple_search_engines flag.", true); + } + + ExitCodes main_(int, const char**) + { + //------------------------------------------------------------- + // general variables and data to perform PSMFeatureExtractor + //------------------------------------------------------------- + vector all_peptide_ids; + vector all_protein_ids; + + //------------------------------------------------------------- + // parsing parameters + //------------------------------------------------------------- + const StringList in_list = getStringList_("in"); + bool multiple_search_engines = getFlag_("multiple_search_engines"); + LOG_DEBUG << "Input file (of target?): " << ListUtils::concatenate(in_list, ",") << endl; + if (in_list.size() > 1 && !multiple_search_engines) + { + writeLog_("Fatal error: multiple input files given for -in, but -multiple_search_engines flag not specified. If the same search engine was used, feed the input files into PSMFeatureExtractor one by one."); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + const String mzid_out(getStringOption_("mzid_out")); + const String out(getStringOption_("out")); + if (mzid_out.empty() && out.empty()) + { + writeLog_("Fatal error: no output file given (parameter 'out' or 'mzid_out')"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + //------------------------------------------------------------- + // read input + //------------------------------------------------------------- + bool skip_db_check = getFlag_("skip_db_check"); + bool concatenate = getFlag_("concat"); + StringList search_engines_used; + for (StringList::const_iterator fit = in_list.begin(); fit != in_list.end(); ++fit) + { + vector peptide_ids; + vector protein_ids; + String in = *fit; + FileHandler fh; + FileTypes::Type in_type = fh.getType(in); + LOG_INFO << "Loading input file: " << in << endl; + if (in_type == FileTypes::IDXML) + { + IdXMLFile().load(in, protein_ids, peptide_ids); + } + else if (in_type == FileTypes::MZIDENTML) + { + LOG_WARN << "Converting from mzid: possible loss of information depending on target format." << endl; + MzIdentMLFile().load(in, protein_ids, peptide_ids); + } + //else caught by TOPPBase:registerInput being mandatory mzid or idxml + + // will check if all ProteinIdentifications have the same search db unless it is the first, in which case all_protein_ids is empty yet. + if (multiple_search_engines && !skip_db_check && !all_protein_ids.empty()) + { + ProteinIdentification::SearchParameters all_search_parameters = all_protein_ids.front().getSearchParameters(); + ProteinIdentification::SearchParameters search_parameters = protein_ids.front().getSearchParameters(); + if (search_parameters.db != all_search_parameters.db) + { + writeLog_("Input files are not searched with the same protein database, " + search_parameters.db + " vs. " + all_search_parameters.db + ". Set -skip_db_check flag to ignore this. Aborting!"); + return INCOMPATIBLE_INPUT_DATA; + } + } + + if (!multiple_search_engines) + { + all_peptide_ids.insert(all_peptide_ids.end(), peptide_ids.begin(), peptide_ids.end()); + } + else + { + String search_engine = protein_ids.front().getSearchEngine(); + if (!ListUtils::contains(search_engines_used, search_engine)) + { + search_engines_used.push_back(search_engine); + } + + if (concatenate) + { + // will concatenate the list + PercolatorFeatureSetHelper::concatMULTISEPeptideIds(all_peptide_ids, peptide_ids, search_engine); + } + else + { + // will collapse the list (reference) + PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(all_peptide_ids, peptide_ids, search_engine); + } + } + PercolatorFeatureSetHelper::mergeMULTISEProteinIds(all_protein_ids, protein_ids); + } + + if (all_protein_ids.empty()) + { + writeLog_("No protein hits found in input file. Aborting!"); + printUsage_(); + return INPUT_FILE_EMPTY; + } + + //------------------------------------------------------------- + // extract search engine and prepare pin + //------------------------------------------------------------- + String search_engine = all_protein_ids.front().getSearchEngine(); + if (multiple_search_engines) search_engine = "multiple"; + LOG_DEBUG << "Registered search engine: " << search_engine << endl; + + StringList extra_features = getStringList_("extra"); + StringList feature_set; + + if (search_engine == "multiple") + { + if (getFlag_("concat")) + { + PercolatorFeatureSetHelper::addCONCATSEFeatures(all_peptide_ids, search_engines_used, feature_set); + } + else + { + bool impute = getFlag_("impute"); + bool limits = getFlag_("limit_imputation"); + PercolatorFeatureSetHelper::addMULTISEFeatures(all_peptide_ids, search_engines_used, feature_set, !impute, limits); + } + } + else if (search_engine == "MS-GF+") PercolatorFeatureSetHelper::addMSGFFeatures(all_peptide_ids, feature_set); + else if (search_engine == "Mascot") PercolatorFeatureSetHelper::addMASCOTFeatures(all_peptide_ids, feature_set); + else if (search_engine == "XTandem") PercolatorFeatureSetHelper::addXTANDEMFeatures(all_peptide_ids, feature_set); + else if (search_engine == "Comet") PercolatorFeatureSetHelper::addCOMETFeatures(all_peptide_ids, feature_set); + else + { + writeLog_("No known input to create PSM features from. Aborting"); + return INCOMPATIBLE_INPUT_DATA; + } + + String run_identifier = all_protein_ids.front().getIdentifier(); + for (vector::iterator it = all_peptide_ids.begin(); it != all_peptide_ids.end(); ++it) + { + it->setIdentifier(run_identifier); + PercolatorFeatureSetHelper::checkExtraFeatures(it->getHits(), extra_features); // will remove inconsistently available features + } + + // TODO: There should only be 1 ProteinIdentification element in this vector, no need for a for loop + for (vector::iterator it = all_protein_ids.begin(); it != all_protein_ids.end(); ++it) + { + ProteinIdentification::SearchParameters search_parameters = it->getSearchParameters(); + + search_parameters.setMetaValue("feature_extractor", "TOPP_PSMFeatureExtractor"); + feature_set.insert(feature_set.end(), extra_features.begin(), extra_features.end()); + search_parameters.setMetaValue("extra_features", ListUtils::concatenate(feature_set, ",")); + it->setSearchParameters(search_parameters); + } + + // Storing the PeptideHits with calculated q-value, pep and svm score + if (!mzid_out.empty()) + { + MzIdentMLFile().store(mzid_out.toQString().toStdString(), all_protein_ids, all_peptide_ids); + } + if (!out.empty()) + { + IdXMLFile().store(out.toQString().toStdString(), all_protein_ids, all_peptide_ids); + } + + writeLog_("PSMFeatureExtractor finished successfully!"); + return EXECUTION_OK; + } + +}; + + +int main(int argc, const char** argv) +{ + PSMFeatureExtractor tool; + + return tool.main(argc, argv); +} + +/// @endcond diff --git a/src/utils/TopPerc.cpp b/src/utils/TopPerc.cpp deleted file mode 100644 index 07289124548..00000000000 --- a/src/utils/TopPerc.cpp +++ /dev/null @@ -1,913 +0,0 @@ -// -------------------------------------------------------------------------- -// OpenMS -- Open-Source Mass Spectrometry -// -------------------------------------------------------------------------- -// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, -// ETH Zurich, and Freie Universitaet Berlin 2002-2016. -// -// This software is released under a three-clause BSD license: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of any author or any participating institution -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// For a full list of authors, refer to the file AUTHORS. -// -------------------------------------------------------------------------- -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING -// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// -------------------------------------------------------------------------- -// $Maintainer: Mathias Walzer $ -// $Authors: Andreas Simon, Mathias Walzer $ -// -------------------------------------------------------------------------- - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -using namespace OpenMS; -using namespace std; - -//------------------------------------------------------------- -//Doxygen docu -//------------------------------------------------------------- - -/** - @page TOPP_TopPerc TopPerc - - @brief TopPerc facilitates the input to, the call of and output integration of Percolator. - Percolator (http://per-colator.com/) is a tool to apply semi-supervised learning for peptide - identification from shotgun proteomics datasets. - - @experimental This tool is work in progress and usage and input requirements might change. - -
- - - - - - - - - - -
potential predecessor tools \f$ \longrightarrow \f$ MSGF+\f$ \longrightarrow \f$ potential successor tools
@ref TOPP_IDFilter @ref TOPP_IDMapper
-
- -

Percolator is search engine sensitive, i.e. it's input has to vary, depending on the search engine.

- - The command line parameters of this tool are: - @verbinclude TOPP_TopPerc.cli - INI file documentation of this tool: - @htmlinclude TOPP_TopPerc.html - - Percolator is written by Lukas Käll (http://per-colator.com/ Copyright Lukas Käll ) -*/ - -// We do not want this class to show up in the docu: -/// @cond TOPPCLASSES - - -class TOPPPercolator : - public TOPPBase -{ -public: - TOPPPercolator() : - TOPPBase("TopPerc", "Facilitate input to Percolator and reintegrate.", false) - { - } - -protected: - void preparePIN(vector& peptide_ids, bool is_decoy, TextFile& txt, int minCharge, int maxCharge) - { - char out_sep = '\t'; - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) - { - // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! - if (hit->metaValueExists("NumMatchedMainIons")) - { - // only take features from first ranked entries and only with meanerrortop7 != 0.0 - if (hit->getRank() == 1 && hit->getMetaValue("MeanErrorTop7").toString().toDouble() != 0.0) - { - int rank = hit->getRank(); - int charge = hit->getCharge(); - - String spec_ref = it->getMetaValue("spectrum_id").toQString().toStdString(); //TODO consider other spectraIDFormats or keep index only in metavalue - vector scan_id; - spec_ref.split("scan=", scan_id); - - int label = 1; - String SpecId = "target_SII_"; - if (is_decoy) - { - SpecId = "decoy_SII_"; - label = -1; - } - - SpecId += scan_id[1] + "_" + String(rank) + "_" + scan_id[1] + "_" + String(charge) + "_" + String(rank); - - double rawScore = hit->getMetaValue("MS:1002049").toString().toDouble(); - double denovoScore = hit->getMetaValue("MS:1002050").toString().toDouble(); - - double scoreRatio; - if (denovoScore > 0) - { - scoreRatio = (rawScore / denovoScore); - } - else - { - scoreRatio = rawScore * 10000; - } - - double energy = denovoScore - rawScore; - double ln_eval = -log(hit->getMetaValue("MS:1002053").toString().toDouble()); - int isotopeError = hit->getMetaValue("IsotopeError").toString().toInt(); - double lnExplainedIonCurrentRatio = log(hit->getMetaValue("ExplainedIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! - double lnNTermIonCurrentRatio = log(hit->getMetaValue("NTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! - double lnCTermIonCurrentRatio = log(hit->getMetaValue("CTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! - double lnMS2IonCurrent = log(hit->getMetaValue("MS2IonCurrent").toString().toDouble()); - double expMass = it->getMZ(); - double calcMass = hit->getMetaValue("calcMZ"); - int pepLen = hit->getSequence().toString().length(); - double dM = (expMass - (isotopeError * Constants::NEUTRON_MASS_U / charge) - calcMass) / expMass; - double absdM = abs(dM); - - - double meanErrorTop7 = hit->getMetaValue("MeanErrorTop7").toString().toDouble(); - int NumMatchedMainIons = hit->getMetaValue("NumMatchedMainIons").toString().toInt(); - double stdevErrorTop7 = 0.0; - - if (hit->getMetaValue("StdevErrorTop7").toString() != "NaN") - { - stdevErrorTop7 = hit->getMetaValue("StdevErrorTop7").toString().toDouble(); - if (stdevErrorTop7 == 0.0) - { - stdevErrorTop7 = meanErrorTop7; - } - } - else - { - LOG_WARN << "Stdeverrortop7 is NaN" << endl; - } - - meanErrorTop7 = rescaleFragmentFeature(meanErrorTop7, NumMatchedMainIons); - double sqMeanErrorTop7 = rescaleFragmentFeature(meanErrorTop7 * meanErrorTop7, NumMatchedMainIons); - stdevErrorTop7 = rescaleFragmentFeature(stdevErrorTop7, NumMatchedMainIons); - - // write 1 for the correct charge, 0 for other charges - // i.e.: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - char aaBefore = hit->getPeptideEvidences().front().getAABefore(); - char aaAfter = hit->getPeptideEvidences().front().getAAAfter(); - - // sequence without modification: "ABC" instead of "ABC[UNIMOD:4]" - String peptide_without_modifications = aaBefore + string(".") + hit->getSequence().toUnmodifiedString() + string(".") + aaAfter; - - // formula taken from percolator msgfplus-converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide_without_modifications.at(0), peptide_without_modifications.at(2), getStringOption_("enzyme")); - bool enzC = isEnz(peptide_without_modifications.at(peptide_without_modifications.size() - 3), peptide_without_modifications.at(peptide_without_modifications.size() - 1), getStringOption_("enzyme")); - int enzInt = countEnzymatic(hit->getSequence().toUnmodifiedString(), getStringOption_("enzyme")); - - String peptide_with_modifications = aaBefore + string(".") + hit->getSequence().toString() + string(".") + aaAfter; - String protein = hit->getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = SpecId + out_sep + String(label) + out_sep + scan_id[1] + out_sep + (String)rawScore + out_sep + - (String)denovoScore + out_sep + (String)scoreRatio + out_sep + (String)energy + out_sep + (String)ln_eval + - out_sep + (String)isotopeError + out_sep + (String)lnExplainedIonCurrentRatio + out_sep + - (String)lnNTermIonCurrentRatio + out_sep + (String)lnCTermIonCurrentRatio + out_sep + (String)lnMS2IonCurrent - + out_sep + (String)expMass + out_sep + (String)pepLen + out_sep + (String)dM + out_sep + (String)absdM + out_sep + - (String)meanErrorTop7 + out_sep + (String)sqMeanErrorTop7 + out_sep + (String)stdevErrorTop7 + - out_sep + String(ss.str()) + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + - peptide_with_modifications + out_sep + protein + out_sep; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); - } - } - } - } - } - - // Function taken from Enzyme.h from Percolator - bool isEnz(const char& n, const char& c, string enz) - { - if (enz == "trypsin") - { - return ((n == 'K' || n == 'R') && c != 'P') || n == '-' || c == '-'; - } - else if (enz == "chymotrypsin") - { - return ((n == 'F' || n == 'W' || n == 'Y' || n == 'L') && c != 'P') || n == '-' || c == '-'; - } - else if (enz == "thermolysin") - { - return ((c == 'A' || c == 'F' || c == 'I' || c == 'L' || c == 'M' - || c == 'V' || (n == 'R' && c == 'G')) && n != 'D' && n != 'E') || n == '-' || c == '-'; - } - else if (enz == "proteinasek") - { - return (n == 'A' || n == 'E' || n == 'F' || n == 'I' || n == 'L' - || n == 'T' || n == 'V' || n == 'W' || n == 'Y') || n == '-' || c == '-'; - } - else if (enz == "pepsin") - { - return ((c == 'F' || c == 'L' || c == 'W' || c == 'Y' || n == 'F' - || n == 'L' || n == 'W' || n == 'Y') && n != 'R') || n == '-' || c == '-'; - } - else if (enz == "elastase") - { - return ((n == 'L' || n == 'V' || n == 'A' || n == 'G') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "lys-n") - { - return (c == 'K') - || n == '-' || c == '-'; - } - else if (enz == "lys-c") - { - return ((n == 'K') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "arg-c") - { - return ((n == 'R') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "asp-n") - { - return (c == 'D') - || n == '-' || c == '-'; - } - else if (enz == "glu-c") - { - return ((n == 'E') && (c != 'P')) - || n == '-' || c == '-'; - } - else - { - return true; - } - } - - // Function taken from Enzyme.h from Percolator - size_t countEnzymatic(String peptide, string enz) - { - size_t count = 0; - for (size_t ix = 1; ix < peptide.size(); ++ix) - { - if (isEnz(peptide[ix - 1], peptide[ix], enz)) - { - ++count; - } - } - return count; - } - - // Function taken from the percolator converter MsgfplusReader - double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons) - { - // Rescale the fragment features to penalize features calculated by few ions - int numMatchedIonLimit = 7; - int numerator = (1 + numMatchedIonLimit) * (1 + numMatchedIonLimit); - int denominator = (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)) * (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)); - return featureValue * ((double)numerator / denominator); - } - - void registerOptionsAndFlags_() - { - registerInputFile_("percolator_executable", "", "", "Path to the percolator binary", true, false, ListUtils::create("skipexists")); - registerInputFile_("in_target", "", "", "Input target file"); - registerInputFile_("in_decoy", "", "", "Input decoy file"); - setValidFormats_("in_target", ListUtils::create("mzid")); - setValidFormats_("in_decoy", ListUtils::create("mzid")); - - registerOutputFile_("out", "", "", "Output file", true); - registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin", false); - -// registerOutputFile_("r", "", "out", "Output tab delimited results to a file instead of stdout", false); - registerOutputFile_("X", "", "", "path to file in xml-output format (pout). Default is: pout.tab", false); - registerFlag_("e", "read xml-input format (pin) from standard input"); - registerFlag_("Z", "Include decoys (PSMs, peptides and/or proteins) in the xml-output. Only available if -X is used."); - registerDoubleOption_("p", "", 0.0, "Cpos, penalty for mistakes made on positive examples. Set by cross validation if not specified.", false); - registerDoubleOption_("n", "", 0.0, "Cneg, penalty for mistakes made on negative examples. Set by cross validation if not specified.", false); - registerDoubleOption_("F", "", 0.01, "False discovery rate threshold to define positive examples in training. Set by cross validation if 0. Default is 0.01.", false); - registerDoubleOption_("t", "", 0.01, "False discovery rate threshold for evaluating best cross validation result and the reported end result. Default is 0.01.", false); - registerIntOption_("i", "", 0, "Maximal number of iterations", false); - registerFlag_("x", "Quicker execution by reduced internal cross-validation."); - registerDoubleOption_("f", "", 0.6, "Fraction of the negative data set to be used as train set when only providing one negative set, remaining examples will be used as test set. Set to 0.6 by default.", false); - registerOutputFile_("J", "", "", "Output the computed features to the given file in tab-delimited format. A file with the features with the given file name will be created", false); - registerInputFile_("k", "", "", "Input file given in the deprecated pin-xml format generated by e.g. sqt2pin with the -k option", false); - registerOutputFile_("w", "", "", "Output final weights to the given file", false); - registerInputFile_("W", "", "", "Read initial weights to the given file", false); - registerStringOption_("V", "", "", "The most informative feature given as the feature name, can be negated to indicate that a lower value is better.", false); - registerIntOption_("v", "", 2, "Set verbosity of output: 0=no processing info, 5=all, default is 2", false); - registerFlag_("u", "Use unit normalization [0-1] instead of standard deviation normalization"); - registerFlag_("R", "Measure performance on test set each iteration"); - registerFlag_("O", "Override error check and do not fall back on default score vector in case of suspect score vector"); - registerIntOption_("S", "", 1, "Setting seed of the random number generator. Default value is 1", false); - registerFlag_("K", "Retention time features calculated as in Klammer et al."); - registerFlag_("D", "Include description of correct features"); - registerOutputFile_("B", "", "", "Output tab delimited results for decoys into a file", false); - registerFlag_("U", "Do not remove redundant peptides, keep all PSMS and exclude peptide level probabilities."); - registerFlag_("s", "skip validation of input file against xml schema"); - registerFlag_("A", "output protein level probabilities"); - registerDoubleOption_("a", "", 0.0, "Probability with which a present protein emits an associated peptide (to be used jointly with the -A option). Set by grid search if not specified.", false); - registerDoubleOption_("b", "", 0.0, "Probability of the creation of a peptide from noise (to be used jointly with the -A option). Set by grid search if not specified", false); - registerDoubleOption_("G", "", 0.0, "Prior probability of that a protein is present in the sample ( to be used with the -A option). Set by grid search if not specified", false); - registerFlag_("g", "treat ties as if it were one protein (Only valid if option -A is active)."); - registerFlag_("I", "use pi_0 value when calculating empirical q-values (no effect if option Q is activated) (Only valid if option -A is active)."); - registerFlag_("q", "output empirical q-values and p-values (from target-decoy analysis) (Only valid if option -A is active)."); - registerFlag_("N", "disactivates the grouping of proteins with similar connectivity, for example if proteins P1 and P2 have the same peptides matching both of them, P1 and P2 will not be grouped as one protein (Only valid if option -A is active)."); - registerFlag_("E", "Proteins graph will not be separated in sub-graphs (Only valid if option -A is active)."); - registerFlag_("C", "it does not prune peptides with a very low score (~0.0) which means that if a peptide with a very low score is matching two proteins, when we prune the peptide,it will be duplicated to generate two new protein groups (Only valid if option -A is active)."); - registerIntOption_("d", "", 0, "Setting depth 0 or 1 or 2 from low depth to high depth(less computational time) of the grid search for the estimation Alpha,Beta and Gamma parameters for fido(Only valid if option -A is active). Default value is 0", false); - registerStringOption_("P", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (by default : random) (Only valid if option -A is active).", false); - registerFlag_("T", "Reduce the tree of proteins (removing low scored proteins) in order to estimate alpha,beta and gamma faster.(Only valid if option -A is active)."); - registerFlag_("Y", "Use target decoy competition to compute peptide probabilities.(recommended when using -A)."); - registerFlag_("H", "Q-value threshold that will be used in the computation of the MSE and ROC AUC score in the grid search (recommended 0.05 for normal size datasets and 0.1 for big size datasets).(Only valid if option -A is active)."); - registerFlag_("fido-truncation", "Proteins with a very low score (< 0.001) will be truncated (assigned 0.0 probability).(Only valid if option -A is active)"); - registerFlag_("Q", "Uses protein group level inference, each cluster of proteins is either present or not, therefore when grouping proteins discard all possible combinations for each group.(Only valid if option -A is active and -N is inactive)."); - } - - ExitCodes main_(int, const char**) - { - //------------------------------------------------------------- - // general variables and data to perform TopPerc - //------------------------------------------------------------- - vector peptide_ids; - vector protein_ids; - vector peptide_ids_d; - vector protein_ids_d; - - //------------------------------------------------------------- - // parsing parameters and crashing if mandatory parameters are missing - //------------------------------------------------------------- - String inputfile_target_name = getStringOption_("in_target").toQString().toStdString(); - writeDebug_(String("Input file of target: ") + inputfile_target_name, 1); - if (inputfile_target_name == "") - { - writeLog_("No target input file specified. Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } - - String inputfile_decoy_name = getStringOption_("in_decoy").toQString().toStdString(); - writeDebug_(String("Input file of decoy: ") + inputfile_decoy_name, 1); - if (inputfile_decoy_name == "") - { - writeLog_("No decoy input file specified. Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } - - String percolator_executable(getStringOption_("percolator_executable")); - writeDebug_(String("Path to the percolator: ") + percolator_executable, 1); - if (percolator_executable == "") //TODO TOPPBase::findExecutable - { - writeLog_("No path to percolator specified. Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } - - // get the file extension of the input files to start the correct converter - vector input_target_file; - vector input_decoy_file; - inputfile_target_name.split('.', input_target_file); - String data_target = input_target_file[input_target_file.size() - 1]; - inputfile_decoy_name.split('.', input_decoy_file); - String data_decoy = input_decoy_file[input_decoy_file.size() - 1]; - - TextFile txt; - char out_sep = '\t'; - - //get Information about database search - String datab = ""; - - // converter for MSGF+ & Mascot Files - if (data_target == "mzid" && data_decoy == "mzid") - { - datab = "MSGF+"; - - // TODO FOR FUTURE DEVELOPMENT: check out without explicit parameter setting if input file is target or decoy!!! - // Both input files are read in - MzIdentMLFile().load(inputfile_target_name, protein_ids, peptide_ids); - MzIdentMLFile().load(inputfile_decoy_name, protein_ids_d, peptide_ids_d); - LOG_INFO << "Using IDs from" << protein_ids.back().getSearchEngine() << endl; - - // Open File and check if the Identifier is MSGF+ - if (peptide_ids.front().getIdentifier() == "MS-GF+" && peptide_ids_d.front().getIdentifier() == "MS-GF+") - { - - // Find out how many possible charges are available - int maxCharge = 0; - int minCharge = 10; - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) - { - if (hit->getCharge() > maxCharge) - { - maxCharge = hit->getCharge(); - } - if (hit->getCharge() < minCharge) - { - minCharge = hit->getCharge(); - } - } - } - - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge + 1; j < maxCharge + 1; j++) - { - ss << "Charge" << j << ","; - } - - // Create header for the features - string featureset = "SpecId, Label,ScanNr, RawScore, DeNovoScore,ScoreRatio, Energy,lnEValue,IsotopeError, lnExplainedIonCurrentRatio,lnNTermIonCurrentRatio,lnCTermIonCurrentRatio,lnMS2IonCurrent,Mass,PepLen,dM,absdM,MeanErrorTop7,sqMeanErrorTop7,StdevErrorTop7," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; - StringList txt_header0 = ListUtils::create(featureset); - txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); - LOG_INFO << "consuming target file" << endl; - // get all the features from the target file - preparePIN(peptide_ids, false, txt, minCharge, maxCharge); - LOG_INFO << "consuming decoy file" << endl; - // get all the features from the decoy file - preparePIN(peptide_ids_d, true, txt, minCharge, maxCharge); - } - else if (peptide_ids.front().getIdentifier() == "Mascot" && peptide_ids_d.front().getIdentifier() == "Mascot") - { - // TODO: Mascot Implementation - } - } - // converter for XTandem-Files - // TODO IN FUTURE DEVELOPMENT: IMPLEMENT MZID READER FOR XTANDEMFILES - else if (data_target == "idXML" && data_decoy == "idXML") - { - datab = "XTANDEM"; - IdXMLFile file; - IdXMLFile decoy_file; - file.load(getStringOption_("in_target"), protein_ids, peptide_ids); - decoy_file.load(getStringOption_("in_decoy"), protein_ids_d, peptide_ids_d); - - // Find out how many possible charges are available - int maxCharge = 0; - int minCharge = 10; - - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) - { - if (hit->getCharge() > maxCharge) - { - maxCharge = hit->getCharge(); - } - if (hit->getCharge() < minCharge) - { - minCharge = hit->getCharge(); - } - } - } - - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge + 1; j < maxCharge + 1; j++) - { - - ss << "Charge" << j << ","; - } - - // Find out which ions are in XTandem-File and take only these as features - stringstream ss_ion; - if (peptide_ids.front().getHits().front().getMetaValue("a_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion << "frac_ion_a" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("b_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion << "frac_ion_b" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("c_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion << "frac_ion_c" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("x_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion << "frac_ion_x" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("y_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion << "frac_ion_y" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("z_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion << "frac_ion_z" << ","; - } - - // Create header for the features - String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + - ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; - StringList txt_header0 = ListUtils::create(featureset); - // Insert the header with the features names to the file - txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); - - LOG_INFO << "read in target file" << endl; - // get all the features from the target file - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - if (it->isHigherScoreBetter()) - { - //TODO this must be spectrum_reference!!! parse spectrum number from there if necessary! - String scannumber = String(it->getMetaValue("spectrum_id")); - int charge = it->getHits().front().getCharge(); - int label = 1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out correct ion types and get its Values - stringstream ss_ion_2; - - if (it->getHits().front().getMetaValue("a_score").toString() != "" && - it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && - it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" && - it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && - it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && - it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && - it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length << out_sep; - } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = it->getHits().front().getMetaValue("delta"); - double mh = mass + dm; - double absdM = abs(dm); - - // write 1 for the correct charge, 0 for other charges - // i.e.: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), getStringOption_("enzyme")); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), getStringOption_("enzyme")); - int enzInt = countEnzymatic(sequence, getStringOption_("enzyme")); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_target_" + scannumber + "_" + String(charge) + - "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + - out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + - String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + - String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); - } - } - - LOG_INFO << "read in decoy file" << endl; - // get all the features from the decoy file - for (vector::iterator it = peptide_ids_d.begin(); it != peptide_ids_d.end(); ++it) - { - if (it->isHigherScoreBetter()) - { - String scannumber = String(it->getMetaValue("spectrum_id")); - int charge = it->getHits().front().getCharge(); - int label = -1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out correct ion types and get its Values - stringstream ss_ion_2; - - if (it->getHits().front().getMetaValue("a_score").toString() != "" && it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" && it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length; - } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = double(it->getHits().front().getMetaValue("delta")); - double mh = mass + dm; - double absdM = abs(dm); - - // write 1 for the correct charge, 0 for other charges - // i.e: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), getStringOption_("enzyme")); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), getStringOption_("enzyme")); - int enzInt = countEnzymatic(sequence, getStringOption_("enzyme")); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_decoy_" + scannumber + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep - + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); - } - } - } - else - { - LOG_INFO << "target and decoy files are not of the same type" << endl; - } - - LOG_INFO << "Executing percolator" << endl; - - // create temp directory to store percolator in file pin.tab temporarily - QDir qdir_temp(File::getTempDirectory().toQString()); - String temp_data_directory = File::getUniqueName(); - qdir_temp.mkdir(temp_data_directory.toQString()); - qdir_temp.cd(temp_data_directory.toQString()); - temp_data_directory = File::getTempDirectory() + "/" + temp_data_directory; - String in_file = temp_data_directory + "/" + File::getUniqueName() + ".tab"; - String out_file = temp_data_directory + "/" + File::getUniqueName() + ".tab"; - - // File is stored in temp directory - txt.store(in_file); - - QProcess process; - QStringList arguments; - - // Check all set parameters and get them into arguments StringList - arguments << "-r" << out_file.toQString(); - if (getFlag_("e")) arguments << "-e"; - if (getFlag_("Z")) arguments << "-Z"; - if (getDoubleOption_("p") != 0.0) arguments << "-p" << String(getDoubleOption_("p")).toQString(); - if (getDoubleOption_("n") != 0.0) arguments << "-n" << String(getDoubleOption_("n")).toQString(); - if (getDoubleOption_("F") != 0.01) arguments << "-F" << String(getDoubleOption_("F")).toQString(); - if (getDoubleOption_("t") != 0.01) arguments << "-t" << String(getDoubleOption_("t")).toQString(); - if (getIntOption_("i") != 0) arguments << "-i" << String(getIntOption_("i")).toQString(); - if (getFlag_("x")) arguments << "-x"; - if (getDoubleOption_("f") != 0.6) arguments << "-f" << String(getDoubleOption_("f")).toQString(); - if (getStringOption_("J") != "") arguments << "-J" << getStringOption_("J").toQString(); - if (getStringOption_("k") != "") arguments << "-k" << getStringOption_("k").toQString(); - if (getStringOption_("w") != "") arguments << "-w" << getStringOption_("w").toQString(); - if (getStringOption_("W") != "") arguments << "-W" << getStringOption_("W").toQString(); - if (getStringOption_("V") != "") arguments << "-V" << getStringOption_("V").toQString(); - if (getIntOption_("v") != 2) arguments << "-v" << String(getIntOption_("v")).toQString(); - if (getFlag_("u")) arguments << "-u"; - if (getFlag_("R")) arguments << "-R"; - if (getFlag_("O")) arguments << "-O"; - if (getIntOption_("S") != 1) arguments << "-S" << String(getDoubleOption_("S")).toQString(); - if (getFlag_("K")) arguments << "-K"; - if (getFlag_("D")) arguments << "-D"; - if (getStringOption_("B") != "") arguments << "-B" << getStringOption_("B").toQString(); - if (getFlag_("U")) arguments << "-U"; - if (getFlag_("s")) arguments << "-s"; - if (getFlag_("A")) arguments << "-A"; - if (getDoubleOption_("a") != 0.0) arguments << "-a" << String(getDoubleOption_("a")).toQString(); - if (getDoubleOption_("b") != 0.0) arguments << "-b" << String(getDoubleOption_("b")).toQString(); - if (getDoubleOption_("G") != 0.0) arguments << "-G" << String(getDoubleOption_("G")).toQString(); - if (getFlag_("g")) arguments << "-g"; - if (getFlag_("I")) arguments << "-I"; - if (getFlag_("q")) arguments << "-q"; - if (getFlag_("N")) arguments << "-N"; - if (getFlag_("E")) arguments << "-E"; - if (getFlag_("C")) arguments << "-C"; - if (getIntOption_("d") != 0) arguments << "-d" << String(getIntOption_("d")).toQString(); - if (getStringOption_("P") != "random") arguments << "-P" << getStringOption_("P").toQString(); - if (getFlag_("T")) arguments << "-T"; - if (getFlag_("Y")) arguments << "-Y"; - if (getFlag_("H")) arguments << "-H"; - if (getFlag_("fido-truncation")) arguments << "--fido-truncation"; - if (getFlag_("Q")) arguments << "-Q"; - arguments << "-U"; - arguments << in_file.toQString(); - - // Percolator execution with the executable ant the arguments StringList - process.execute(percolator_executable.toQString(), arguments); // does automatic escaping etc... - - // when percolator finished calculation, it stores the results -r option (with or without -U) or -m (which seems to be not working) - CsvFile csv_file(out_file, '\t'); - - map > pep_map; - StringList row; - - for (UInt i = 1; i < csv_file.rowCount(); ++i) - { - csv_file.getRow(i, row); - vector row_values; - // peptide - row_values.push_back(row[4].chop(2).reverse().chop(2).reverse()); - // SVM-score - row_values.push_back(row[1]); - // Q-Value - row_values.push_back(row[2]); - // PEP - row_values.push_back(row[3]); - - vector substr; - row[0].split('_', substr); - pep_map[substr[2]] = row_values; // scannr. as written in preparePIN - } - - - // Add the percolator results to the peptide vector of the original input file - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - String sid = it->getMetaValue("spectrum_id"); - if (pep_map.find(sid) == pep_map.end()) - { - vector sr; - sid.split('=', sr); - sid = sr.back(); - if (pep_map.find(sid) == pep_map.end()) - { - //no spectrum found - log? - continue; - } - } - - it->setScoreType("q-value"); - it->setHigherScoreBetter(false); - vector temp; - swap(temp, it->getHits()); - for (vector::iterator hit = temp.begin(); hit != temp.end(); ++hit) - { - AASequence aat; - aat.fromString(pep_map[sid][0]); - if (hit->getSequence() == aat) - { - //get aa before/after/charge and metainfo - hit->setMetaValue("MS:1001492", pep_map[sid][1].toDouble()); //svm score - double qv = pep_map[sid][2].toDouble(); // q-value - hit->setMetaValue("MS:1001491", qv); - hit->setScore(qv); - hit->setMetaValue("MS:1001493", pep_map[sid][3].toDouble()); //pep - hit->setSequence(aat); - it->insertHit(*hit); - } - } - // TODO what with those not in percolator result file -> empty PeptideHit vector? - } - - for (vector::iterator it = protein_ids.begin(); it != protein_ids.end(); ++it) - { - it->setSearchEngine("Percolator"); - } - - // Storing the PeptideHits with calculated q-value, pep and svm score - MzIdentMLFile().store(getStringOption_("out").toQString().toStdString(), protein_ids, peptide_ids); - - LOG_INFO << "TopPerc finished successfully!" << endl; - - // As the percolator output file is not needed anymore, the temporary directory is going to be deleted -// File::removeDirRecursively(temp_data_directory); - - return EXECUTION_OK; - } - -}; - - -int main(int argc, const char** argv) -{ - TOPPPercolator tool; - - return tool.main(argc, argv); -} - -/// @endcond diff --git a/src/utils/executables.cmake b/src/utils/executables.cmake index 9d8d6274d70..fdb6590c11c 100644 --- a/src/utils/executables.cmake +++ b/src/utils/executables.cmake @@ -4,6 +4,7 @@ set(directory source/APPLICATIONS/UTILS) ### list all filenames of the directory here set(UTILS_executables AccurateMassSearch +COMETAdapter CVInspector DecoyDatabase DatabaseFilter @@ -32,6 +33,7 @@ MultiplexResolver MzMLSplitter OpenMSInfo PeakPickerIterative +PSMFeatureExtractor QCCalculator QCEmbedder QCExporter @@ -51,7 +53,6 @@ SpectraSTSearchAdapter SvmTheoreticalSpectrumGeneratorTrainer TICCalculator TransformationEvaluation -TopPerc XMLValidator )