From 507a06b7d29c5e92ac196ccfba76e409901faf30 Mon Sep 17 00:00:00 2001 From: Leon Bichmann Date: Wed, 8 Mar 2017 09:37:33 +0100 Subject: [PATCH 01/41] [Feature] add COMETAdapter --- src/utils/COMETAdapter.cpp | 594 ++++++++++++++++++++++++++++++++++++ src/utils/executables.cmake | 1 + 2 files changed, 595 insertions(+) create mode 100755 src/utils/COMETAdapter.cpp diff --git a/src/utils/COMETAdapter.cpp b/src/utils/COMETAdapter.cpp new file mode 100755 index 00000000000..803916591bd --- /dev/null +++ b/src/utils/COMETAdapter.cpp @@ -0,0 +1,594 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2016. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Andreas Bertsch, Chris Bielow $ +// -------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include // std::cout, std::ostream, std::ios + +#include + +using namespace OpenMS; +using namespace std; + +//------------------------------------------------------------- +//Doxygen docu +//------------------------------------------------------------- + +/** + @page TOPP_XTandemAdapter XTandemAdapter + + @brief Identifies peptides in MS/MS spectra via XTandem. + +
+ + + + + + + + + + +
pot. predecessor tools \f$ \longrightarrow \f$ XTandemAdapter \f$ \longrightarrow \f$ pot. successor tools
any signal-/preprocessing tool @n (in mzML format) @ref TOPP_IDFilter or @n any protein/peptide processing tool
+
+ + @em X! Tandem must be installed before this wrapper can be used. This wrapper + has been successfully tested with several versions of X! Tandem. + The last known version to work is 2009-04-01. We encountered problems with + later versions (namely 2010-01-01). + + To speed up computations, FASTA databases can be compressed using the fasta_pro.exe + tool of @em X! Tandem. It is contained in the "bin" folder of the @em X! Tandem installation. + Refer to the docu of @em X! Tandem for further information about settings. + + This adapter supports relative database filenames, which (when not found in the current working directory) is looked up in + the directories specified by 'OpenMS.ini:id_db_dir' (see @subpage TOPP_advanced). + + X! Tandem settings not exposed by this adapter can be directly adjusted using an XML configuration file. + By default, all (!) parameters available explicitly via this wrapper take precedence over the XML configuration file. + The parameter "default_config_file" can be used to specify such a custom configuration. + An example of a configuration file (named "default_input.xml") is contained in the "bin" folder of the + @em X! Tandem installation and the OpenMS installation under OpenMS/share/CHEMISTRY/XTandem_default_input.xml. + The latter is loaded by default. + If you want to use the XML configuration file and @em ignore most of the parameters set via this adapter, use the '-ignore_adapter_param' + flag. Then, the config given in '-default_config_file' is used exclusively and only '-in', '-out', '-database' and '-xtandem_executable' are + taken from this adapter. + + @note Currently mzIdentML (mzid) is not directly supported as an input/output format of this tool. Convert mzid files to/from idXML using @ref TOPP_IDFileConverter if necessary. 
+ + The command line parameters of this tool are: + @verbinclude TOPP_XTandemAdapter.cli + INI file documentation of this tool: + @htmlinclude TOPP_XTandemAdapter.html +*/ + +// We do not want this class to show up in the docu: +/// @cond TOPPCLASSES + + +class TOPPCOMETAdapter : + public TOPPBase +{ +public: + TOPPCOMETAdapter() : + TOPPBase("COMETAdapter", "Annotates MS/MS spectra using COMET.") + { + } + +protected: + void registerOptionsAndFlags_() + { + + registerInputFile_("in", "", "", "Input file"); + setValidFormats_("in", ListUtils::create("mzML")); + registerOutputFile_("out", "", "", "Output file"); + setValidFormats_("out", ListUtils::create("idXML")); + registerInputFile_("database", "", "", "FASTA file or pro file. Non-existing relative file-names are looked up via'OpenMS.ini:id_db_dir'", true, false, ListUtils::create("skipexists")); + setValidFormats_("database", ListUtils::create("FASTA")); + registerInputFile_("comet_executable", "", + // choose the default value according to the platform where it will be executed + // X! Tandem compiles as tandem on OSX and tandem.exe on any other platform +#if defined(__APPLE__) + "comet.exe", +#else + "comet.exe", +#endif + "Comet executable of the installation e.g. 
'comet.exe'", true, false, ListUtils::create("skipexists")); + + addEmptyLine_(); + // + // Optional parameters (if '-ignore_adapter_param' is set) + // + + registerDoubleOption_("peptide_mass_tolerance", "", 10.0, "peptide_mass_tolerance", false); + registerDoubleOption_("precursor_mass_tolerance", "", 10.0, "Precursor mass tolerance", false); + registerDoubleOption_("fragment_mass_tolerance", "", 0.3, "Fragment mass error", false); + + registerIntOption_("peptide_mass_units", "", 2, "0=amu, 1=mmu, 2=ppm", false); + registerStringOption_("precursor_error_units", "", "ppm", "Parent monoisotopic mass error units", false); + registerStringOption_("fragment_error_units", "", "Da", "Fragment monoisotopic mass error units", false); + vector valid_strings = ListUtils::create("ppm,Da"); + + registerIntOption_("mass_type_parent", "", 1, "0=average masses, 1=monoisotopic masses", false); + registerIntOption_("mass_type_fragment", "", 1, "0=average masses, 1=monoisotopic masses", false); + registerIntOption_("precursor_tolerance_type", "", 0, "0=average masses, 1=monoisotopic masses", false); + registerIntOption_("isotope_error", "", 0, "0=off, 1=on -1/0/1/2/3 (standard C13 error), 2= -8/-4/0/4/8 (for +4/+8 labeling)", false); + + registerIntOption_("num_enzyme_termini", "", 2, "1 (semi-digested), 2 (fully digested, default), 8 C-term unspecific , 9 N-term unspecific", false); + registerIntOption_("allowed_missed_cleavages", "", 1, "Number of possible cleavage sites missed by the enzyme, maximum value is 5; for enzyme search", false); + + registerStringOption_("decoy_prefix", "", "rev_", "decoy entries are denoted by this string which is pre-pended to each protein accession", false); + + //setValidStrings_("precursor_error_units", valid_strings); + //setValidStrings_("fragment_error_units", valid_strings); + + //registerIntOption_("min_precursor_charge", "", 2, "Minimum precursor charge", false); + //registerIntOption_("max_precursor_charge", "", 4, "Maximum precursor 
charge", false); + //registerStringList_("fixed_modifications", "", ListUtils::create(""), "Fixed modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'", false); + vector all_mods; + ModificationsDB::getInstance()->getAllSearchModifications(all_mods); + //setValidStrings_("fixed_modifications", all_mods); + registerStringList_("variable_modifications", "", ListUtils::create(""), "Variable modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'", false); + setValidStrings_("variable_modifications", all_mods); + + addEmptyLine_(); + + vector all_enzymes; + EnzymesDB::getInstance()->getAllXTandemNames(all_enzymes); + registerStringOption_("cleavage_site", "", "Trypsin", "The enzyme used for peptide digestion.", false); + setValidStrings_("cleavage_site", all_enzymes); + } + + vector getModifications_(StringList modNames) + { + vector modifications; + + // iterate over modification names and add to vector + for (StringList::iterator mod_it = modNames.begin(); mod_it != modNames.end(); ++mod_it) + { + String modification(*mod_it); + modifications.push_back(ModificationsDB::getInstance()->getModification(modification)); + } + + return modifications; + } + + void createParamFile_(ostream& os) + { + os << "database_name = " << getStringOption_("database") << "\n"; + os << "decoy_search = " << 0 << "\n"; // 0=no (default), 1=concatenated search, 2=separate search + os << "num_threads = " << getIntOption_("threads") << "\n"; // 0=poll CPU to set num threads; else specify num threads directly (max 64) + + // masses + os << "peptide_mass_tolerance = " << getDoubleOption_("peptide_mass_tolerance") << "\n"; + os << "peptide_mass_units = " << getIntOption_("peptide_mass_units") << "\n"; // 0=amu, 1=mmu, 2=ppm + os << "mass_type_parent = " << getIntOption_("mass_type_parent") << "\n"; // 0=average masses, 1=monoisotopic masses + os << "mass_type_fragment = " << 
getIntOption_("mass_type_fragment") << "\n"; // 0=average masses, 1=monoisotopic masses + os << "precursor_tolerance_type = " << getIntOption_("precursor_tolerance_type") << "\n"; // 0=MH+ (default), 1=precursor m/z; only valid for amu/mmu tolerances + os << "isotope_error = " << getIntOption_("isotope_error") << "\n"; // 0=off, 1=on -1/0/1/2/3 (standard C13 error), 2= -8/-4/0/4/8 (for +4/+8 labeling) + + // search enzyme + + // TODO: complete + map map_oms2comet; + map_oms2comet["Trypsin"] = 1; + map_oms2comet["Arg-C"] = 5; + map_oms2comet["Asp-N"] = 6; + map_oms2comet["Chymotrypsin"] = 10; + map_oms2comet["CNBr"] = 7; + map_oms2comet["Lys-C"] = 3; + map_oms2comet["PepsinA"] = 9; + map_oms2comet["Trypsin/P"] = 2; + map_oms2comet["no cleavage"] = 0; + + String enzyme_name = getStringOption_("cleavage_site"); + Size enzyme_number = 1; + if (map_oms2comet.find(enzyme_name) != map_oms2comet.end()) + { + enzyme_number = map_oms2comet.at(enzyme_name); + } + else + { + throw OpenMS::Exception::IllegalArgument(__FILE__, __LINE__, __FUNCTION__, "Error: Enzyme not supported. " + enzyme_name); + } + + os << "search_enzyme_number = " << enzyme_number << "\n"; // choose from list at end of this params file + os << "num_enzyme_termini = " << getIntOption_("num_enzyme_termini") << "\n"; // 1 (semi-digested), 2 (fully digested, default), 8 C-term unspecific , 9 N-term unspecific + os << "allowed_missed_cleavage = " << getIntOption_("allowed_missed_cleavages") << "\n"; // maximum value is 5; for enzyme search + + // Up to 9 variable modifications are supported + // format: <0=variable/else binary> + // e.g. 79.966331 STY 0 3 -1 0 0 + vector variable_modifications_names = getStringList_("variable_modifications"); + vector variable_modifications = getModifications_(variable_modifications_names); + if (variable_modifications.size() > 9) + { + throw OpenMS::Exception::IllegalArgument(__FILE__, __LINE__, __FUNCTION__, "Error: Comet only supports 9 variable modifications. 
" + String(variable_modifications.size()) + " provided."); + } + + Size var_mod_index = 1; + + // write out user specified modifications + for (; var_mod_index <= variable_modifications.size(); ++var_mod_index) + { + const ResidueModification mod = variable_modifications[var_mod_index]; + double mass = mod.getDiffMonoMass(); + String residues = mod.getOrigin(); // TODO: check if origin contains C-term string or similar. Should not be passed to commet as residue string + String variable = "0"; + String max_mods_per_peptide = "3"; + String term_distance = "-1"; + String nc_term = "0"; + + if (mod.getTermSpecificity() == ResidueModification::C_TERM) + { + term_distance = 0; + nc_term = "3"; + } + else if (mod.getTermSpecificity() == ResidueModification::N_TERM) + { + term_distance = 0; + nc_term = "2"; + } + else if (mod.getTermSpecificity() == ResidueModification::PROTEIN_N_TERM) + { + term_distance = 0; + nc_term = "0"; + } + else if (mod.getTermSpecificity() == ResidueModification::PROTEIN_C_TERM) + { + term_distance = 0; + nc_term = "1"; + } + String required = "0"; + + os << "variable_mod0" << var_mod_index << " = " << mass << " " << residues << " " << variable << " " << max_mods_per_peptide << " " << term_distance << " " << nc_term << " " << required << "\n"; + } + + // fill remaining modification slots (if any) in Comet with "no modification" + for (; var_mod_index <= 9; ++var_mod_index) + { + os << "variable_mod0" << var_mod_index << " = " << "0.0 X 0 3 -1 0 0" << "\n"; + } + + os << "max_variable_mods_in_peptide = " << 5 << "\n"; + os << "require_variable_mod = " << 0 << "\n"; + + // fragment ions + // ion trap ms/ms: 1.0005 tolerance, 0.4 offset (mono masses), theoretical_fragment_ions = 1 + // high res ms/ms: 0.02 tolerance, 0.0 offset (mono masses), theoretical_fragment_ions = 0 + os << "fragment_bin_tol = " << 1.0005 << "\n"; // binning to use on fragment ions + os << "fragment_bin_offset = " << 0.4 << "\n"; // offset position to start the binning (0.0 to 
1.0) + os << "theoretical_fragment_ions = " << 1 << "\n"; // 0=use flanking peaks, 1=M peak only + os << "use_A_ions = " << 0 << "\n"; + os << "use_B_ions = " << 1 << "\n"; + os << "use_C_ions = " << 0 << "\n"; + os << "use_X_ions = " << 0 << "\n"; + os << "use_Y_ions = " << 1 << "\n"; + os << "use_Z_ions = " << 0 << "\n"; + os << "use_NL_ions = " << 0 << "\n"; // 0=no, 1=yes to consider NH3/H2O neutral loss peaks + + // output + os << "output_sqtstream = " << 0 << "\n"; // 0=no, 1=yes write sqt to standard output + os << "output_sqtfile = " << 0 << "\n"; // 0=no, 1=yes write sqt file + os << "output_txtfile = " << 0 << "\n"; // 0=no, 1=yes write tab-delimited txt file + os << "output_pepxmlfile = " << 1 << "\n"; // 0=no, 1=yes write pep.xml file + os << "output_percolatorfile = " << 1 << "\n"; // 0=no, 1=yes write Percolator tab-delimited input file + os << "output_outfiles = " << 0 << "\n"; // 0=no, 1=yes write .out files + os << "print_expect_score = " << 1 << "\n"; // 0=no, 1=yes to replace Sp with expect in out & sqt + os << "num_output_lines = " << 5 << "\n"; // num peptide results to show + os << "show_fragment_ions = " << 0 << "\n"; // 0=no, 1=yes for out files only + os << "sample_enzyme_number = " << 0 << "\n"; // Sample enzyme which is possibly different than the one applied to the search. 
+ + // mzXML parameters + os << "scan_range = " << "0 0" << "\n"; // start and scan scan range to search; 0 as 1st entry ignores parameter + os << "precursor_charge = " << "0 0" << "\n"; // precursor charge range to analyze; does not override any existing charge; 0 as 1st entry ignores parameter + os << "override_charge = " << 0 << "\n"; // 0=no, 1=override precursor charge states, 2=ignore precursor charges outside precursor_charge range, 3=see online + os << "ms_level = " << 2 << "\n"; // MS level to analyze, valid are levels 2 (default) or 3 + os << "activation_method = " << "ALL" << "\n"; // activation method; used if activation method set; allowed ALL, CID, ECD, ETD, PQD, HCD, IRMPD + + // misc parameters + os << "digest_mass_range = " << "600.0 5000.0" << "\n"; // MH+ peptide mass range to analyze + os << "num_results = " << 100 << "\n"; // number of search hits to store internally + os << "skip_researching = " << 1 << "\n"; // for '.out' file output only, 0=search everything again (default), 1=dont search if .out exists + os << "max_fragment_charge = " << 3 << "\n"; // set maximum fragment charge state to analyze (allowed max 5) + os << "max_precursor_charge = " << 4 << "\n"; // set maximum precursor charge state to analyze (allowed max 9) + os << "nucleotide_reading_frame = " << 0 << "\n"; // 0=proteinDB, 1-6, 7=forward three, 8=reverse three, 9=all six + os << "clip_nterm_methionine = " << 0 << "\n"; // 0=leave sequences as-is; 1=also consider sequence w/o N-term methionine + os << "spectrum_batch_size = " << 0 << "\n"; // max. // of spectra to search at a time; 0 to search the entire scan range in one loop + os << "decoy_prefix = " << getStringOption_("decoy_prefix") << "\n"; // decoy entries are denoted by this string which is pre-pended to each protein accession + os << "output_suffix = " << "" << "\n"; // add a suffix to output base names i.e. 
suffix "-C" generates base-C.pep.xml from base.mzXML input + os << "mass_offsets = " << "" << "\n"; // one or more mass offsets to search (values substracted from deconvoluted precursor mass) + + // spectral processing + os << "minimum_peaks = " << 10 << "\n"; // required minimum number of peaks in spectrum to search (default 10) + os << "minimum_intensity = " << 0 << "\n"; // minimum intensity value to read in + os << "remove_precursor_peak = " << 0 << "\n"; // 0=no, 1=yes, 2=all charge reduced precursor peaks (for ETD) + os << "remove_precursor_tolerance = " << 1.5 << "\n"; // +- Da tolerance for precursor removal + os << "clear_mz_range = " << "0.0 0.0" << "\n"; // for iTRAQ/TMT type data; will clear out all peaks in the specified m/z range + + // additional modifications + os << "add_Cterm_peptide = " << 0.0 << "\n"; + os << "add_Nterm_peptide = " << 0.0 << "\n"; + os << "add_Cterm_protein = " << 0.0 << "\n"; + os << "add_Nterm_protein = " << 0.0 << "\n"; + os << "add_G_glycine = " << 0.0000 << "\n"; // added to G - avg. 57.0513, mono. 57.02146 + os << "add_A_alanine = " << 0.0000 << "\n"; // added to A - avg. 71.0779, mono. 71.03711 + os << "add_S_serine = " << 0.0000 << "\n"; // added to S - avg. 87.0773, mono. 87.03203 + os << "add_P_proline = " << 0.0000 << "\n"; // added to P - avg. 97.1152, mono. 97.05276 + os << "add_V_valine = " << 0.0000 << "\n"; // added to V - avg. 99.1311, mono. 99.06841 + os << "add_T_threonine = " << 0.0000 << "\n"; // added to T - avg. 101.1038, mono. 101.04768 + os << "add_C_cysteine = " << 0.0000 << "\n"; // added to C - avg. 103.1429, mono. 103.00918 + os << "add_L_leucine = " << 0.0000 << "\n"; // added to L - avg. 113.1576, mono. 113.08406 + os << "add_I_isoleucine = " << 0.0000 << "\n"; // added to I - avg. 113.1576, mono. 113.08406 + os << "add_N_asparagine = " << 0.0000 << "\n"; // added to N - avg. 114.1026, mono. 114.04293 + os << "add_D_aspartic_acid = " << 0.0000 << "\n"; // added to D - avg. 115.0874, mono. 
115.02694 + os << "add_Q_glutamine = " << 0.0000 << "\n"; // added to Q - avg. 128.1292, mono. 128.05858 + os << "add_K_lysine = " << 0.0000 << "\n"; // added to K - avg. 128.1723, mono. 128.09496 + os << "add_E_glutamic_acid =" << 0.0000 << "\n"; // added to E - avg. 129.1140, mono. 129.04259 + os << "add_M_methionine =" << 0.0000 << "\n"; // added to M - avg. 131.1961, mono. 131.04048 + os << "add_O_ornithine = " << 0.0000 << "\n"; // added to O - avg. 132.1610, mono 132.08988 + os << "add_H_histidine = " << 0.0000 << "\n"; // added to H - avg. 137.1393, mono. 137.05891 + os << "add_F_phenylalanine = " << 0.0000 << "\n"; // added to F - avg. 147.1739, mono. 147.06841 + os << "add_U_selenocysteine = " << 0.0000 << "\n"; // added to U - avg. 150.3079, mono. 150.95363 + os << "add_R_arginine = " << 0.0000 << "\n"; // added to R - avg. 156.1857, mono. 156.10111 + os << "add_Y_tyrosine = " << 0.0000 << "\n"; // added to Y - avg. 163.0633, mono. 163.06333 + os << "add_W_tryptophan = " << 0.0000 << "\n"; // added to W - avg. 186.0793, mono. 186.07931 + os << "add_B_user_amino_acid = " << 0.0000 << "\n"; // added to B - avg. 0.0000, mono. 0.00000 + os << "add_J_user_amino_acid = " << 0.0000 << "\n"; // added to J - avg. 0.0000, mono. 0.00000 + os << "add_X_user_amino_acid = " << 0.0000 << "\n"; // added to X - avg. 0.0000, mono. 0.00000 + os << "add_Z_user_amino_acid = " << 0.0000 << "\n"; // added to Z - avg. 0.0000, mono. 0.00000 + + // COMET_ENZYME_INFO _must_ be at the end of this parameters file + os << "[COMET_ENZYME_INFO]"; + os << "0. No_enzyme 0 - -" << "\n"; + os << "1. Trypsin 1 KR P" << "\n"; + os << "2. Trypsin/P 1 KR -" << "\n"; + os << "3. Lys_C 1 K P" << "\n"; + os << "4. Lys_N 0 K -" << "\n"; + os << "5. Arg_C 1 R P" << "\n"; + os << "6. Asp_N 0 D -" << "\n"; + os << "7. CNBr 1 M -" << "\n"; + os << "8. Glu_C 1 DE P" << "\n"; + os << "9. PepsinA 1 FL P" << "\n"; + os << "10. 
Chymotrypsin 1 FWYL P" << "\n"; + } + + ExitCodes main_(int, const char**) + { + //------------------------------------------------------------- + // parsing parameters + //------------------------------------------------------------- + + String inputfile_name = getStringOption_("in"); + writeDebug_(String("Input file: ") + inputfile_name, 1); + if (inputfile_name == "") + { + writeLog_("No input file specified. Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + String outputfile_name = getStringOption_("out"); + writeDebug_(String("Output file: ") + outputfile_name, 1); + if (outputfile_name == "") + { + writeLog_("No output file specified. Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + + //------------------------------------------------------------- + // reading input + //------------------------------------------------------------- + + String db_name(getStringOption_("database")); + if (!File::readable(db_name)) + { + String full_db_name; + try + { + full_db_name = File::findDatabase(db_name); + } + catch (...) + { + printUsage_(); + return ILLEGAL_PARAMETERS; + } + db_name = full_db_name; + } + + ofstream os("exampleParams.txt"); + createParamFile_(os); + os.close(); + + PeakMap exp; + MzMLFile mzml_file; + mzml_file.getOptions().addMSLevel(2); // only load msLevel 2 + mzml_file.setLogType(log_type_); + mzml_file.load(inputfile_name, exp); + + if (exp.getSpectra().empty()) + { + throw OpenMS::Exception::FileEmpty(__FILE__, __LINE__, __FUNCTION__, "Error: No MS2 spectra in input file."); + } + + // determine type of spectral data (profile or centroided) + SpectrumSettings::SpectrumType spectrum_type = exp[0].getType(); + + if (spectrum_type == SpectrumSettings::RAWDATA) + { + if (!getFlag_("force")) + { + throw OpenMS::Exception::IllegalArgument(__FILE__, __LINE__, __FUNCTION__, "Error: Profile data provided but centroided MS2 spectra expected. 
To enforce processing of the data set the -force flag."); + } + } + + + //------------------------------------------------------------- + // calculations + //------------------------------------------------------------- + + String comet_executable(getStringOption_("comet_executable")); + int status = QProcess::execute(comet_executable.toQString(), QStringList(inputfile_name.toQString())); // does automatic escaping etc... + if (status != 0) + { + writeLog_("Comet problem. Aborting! Calling command was: '" + comet_executable + " \"" + inputfile_name + "\"'.\nDoes the Comet executable exist?"); + // clean temporary files + /* + if (this->debug_level_ < 2) + { + File::removeDirRecursively(temp_directory); + LOG_WARN << "Set debug level to >=2 to keep the temporary files at '" << temp_directory << "'" << std::endl; + } + else + { + LOG_WARN << "Keeping the temporary files at '" << temp_directory << "'. Set debug level to <2 to remove them." << std::endl; + } + */ + //return EXTERNAL_PROGRAM_ERROR; + } + } +}; + //vector protein_ids; + //ProteinIdentification protein_id; + //protein_id.setPrimaryMSRunPath(exp.getPrimaryMSRunPath()); + //vector peptide_ids; + +/* + // read the output of X! 
Tandem and write it to idXML + XTandemXMLFile tandem_output; + tandem_output.setModificationDefinitionsSet(ModificationDefinitionsSet(getStringList_("fixed_modifications"), getStringList_("variable_modifications"))); + // find the file, because XTandem extends the filename with a timestamp we do not know (exactly) + StringList files; + File::fileList(temp_directory, "_tandem_output_file*.xml", files); + if (files.size() != 1) + { + throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, tandem_output_filename); + } + tandem_output.load(temp_directory + files[0], protein_id, peptide_ids); + + // now put the RTs into the peptide_ids from the spectrum ids + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + UInt id = (Int)it->getMetaValue("spectrum_id"); + --id; // native IDs were written 1-based + if (id < exp.size()) + { + it->setRT(exp[id].getRT()); + double pre_mz(0.0); + if (!exp[id].getPrecursors().empty()) pre_mz = exp[id].getPrecursors()[0].getMZ(); + it->setMZ(pre_mz); + //it->removeMetaValue("spectrum_id"); + } + else + { + LOG_ERROR << "XTandemAdapter: Error: id '" << id << "' not found in peak map!" 
<< endl; + } + } + + //------------------------------------------------------------- + // writing output + //------------------------------------------------------------- + + // handle the search parameters + ProteinIdentification::SearchParameters search_parameters; + search_parameters.db = getStringOption_("database"); + search_parameters.charges = "+" + String(getIntOption_("min_precursor_charge")) + "-+" + String(getIntOption_("max_precursor_charge")); + + ProteinIdentification::PeakMassType mass_type = ProteinIdentification::MONOISOTOPIC; + search_parameters.mass_type = mass_type; + search_parameters.fixed_modifications = getStringList_("fixed_modifications"); + search_parameters.variable_modifications = getStringList_("variable_modifications"); + search_parameters.missed_cleavages = getIntOption_("missed_cleavages"); + search_parameters.fragment_mass_tolerance = getDoubleOption_("fragment_mass_tolerance"); + search_parameters.precursor_mass_tolerance = getDoubleOption_("precursor_mass_tolerance"); + search_parameters.precursor_mass_tolerance_ppm = getStringOption_("precursor_error_units") == "ppm" ? true : false; + search_parameters.fragment_mass_tolerance_ppm = getStringOption_("fragment_error_units") == "ppm" ? true : false; + search_parameters.digestion_enzyme = *EnzymesDB::getInstance()->getEnzyme(enzyme_name); + protein_id.setSearchParameters(search_parameters); + protein_id.setSearchEngineVersion(""); + protein_id.setSearchEngine("XTandem"); + + protein_ids.push_back(protein_id); + + IdXMLFile().store(outputfile_name, protein_ids, peptide_ids); + + /// Deletion of temporary files + if (this->debug_level_ < 2) + { + File::removeDirRecursively(temp_directory); + LOG_WARN << "Set debug level to >=2 to keep the temporary files at '" << temp_directory << "'" << std::endl; + } + else + { + LOG_WARN << "Keeping the temporary files at '" << temp_directory << "'. Set debug level to <2 to remove them." 
<< std::endl; + } + + // some stats + LOG_INFO << "Statistics:\n" + << " identified MS2 spectra: " << peptide_ids.size() << " / " << exp.size() << " = " << int(peptide_ids.size() * 100.0 / exp.size()) << "% (with e-value < " << String(getDoubleOption_("max_valid_expect")) << ")" << std::endl; + + + return EXECUTION_OK; + } +abi +}; +*/ + + +int main(int argc, const char** argv) +{ + TOPPCOMETAdapter tool; + + return tool.main(argc, argv); +} + +/// @endcond diff --git a/src/utils/executables.cmake b/src/utils/executables.cmake index 3ab1f21fbbe..c04ae541ef4 100644 --- a/src/utils/executables.cmake +++ b/src/utils/executables.cmake @@ -4,6 +4,7 @@ set(directory source/APPLICATIONS/UTILS) ### list all filenames of the directory here set(UTILS_executables AccurateMassSearch +COMETAdapter CVInspector DecoyDatabase DatabaseFilter From 93113bcd960443927cc0bcb21f9e737f4650894d Mon Sep 17 00:00:00 2001 From: Leon Bichmann Date: Wed, 22 Mar 2017 14:55:29 +0100 Subject: [PATCH 02/41] COMETAdapter changes - PepXML to idXML error not fixed --- .gitignore | 3 + src/utils/COMETAdapter.cpp | 140 ++++++++++++++----------------------- 2 files changed, 54 insertions(+), 89 deletions(-) diff --git a/.gitignore b/.gitignore index b6cb6f7c796..b93f0dcdd2c 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,6 @@ OpenMS.config OpenMS.creator OpenMS.files OpenMS.includes +.DS_Store +src/.DS_Store +src/openms/.DS_Store diff --git a/src/utils/COMETAdapter.cpp b/src/utils/COMETAdapter.cpp index 803916591bd..9042b6669cd 100755 --- a/src/utils/COMETAdapter.cpp +++ b/src/utils/COMETAdapter.cpp @@ -29,11 +29,12 @@ // // -------------------------------------------------------------------------- // $Maintainer: Chris Bielow $ -// $Authors: Andreas Bertsch, Chris Bielow $ +// $Authors: Leon Bichmann, Andreas Bertsch, Chris Bielow $ // -------------------------------------------------------------------------- #include #include +#include #include #include #include @@ -49,8 +50,8 @@ #include 
#include #include +#include #include // std::cout, std::ostream, std::ios - #include using namespace OpenMS; @@ -200,8 +201,26 @@ class TOPPCOMETAdapter : return modifications; } + void removeTempDir_(const String& tmp_dir) + { + if (tmp_dir.empty()) return; // no temp. dir. created + + if (debug_level_ >= 2) + { + writeDebug_("Keeping temporary files in directory '" + tmp_dir + "'. Set debug level to 1 or lower to remove them.", 2); + } + else + { + if (debug_level_ == 1) writeDebug_("Deleting temporary directory '" + tmp_dir + "'. Set debug level to 2 or higher to keep it.", 1); + File::removeDirRecursively(tmp_dir); + } + } + void createParamFile_(ostream& os) { + os << "# comet_version 2016.01 rev. 2\n"; //required as first line in the param file + os << "# Comet MS/MS search engine parameters file.\n"; + os << "# Everything following the '#' symbol is treated as a comment.\n"; os << "database_name = " << getStringOption_("database") << "\n"; os << "decoy_search = " << 0 << "\n"; // 0=no (default), 1=concatenated search, 2=separate search os << "num_threads = " << getIntOption_("threads") << "\n"; // 0=poll CPU to set num threads; else specify num threads directly (max 64) @@ -444,8 +463,17 @@ class TOPPCOMETAdapter : } db_name = full_db_name; } - - ofstream os("exampleParams.txt"); + + //tmp_dir + const String tmp_dir = QDir::toNativeSeparators((File::getTempDirectory() + "/").toQString()); + writeDebug_("Creating temporary directory '" + tmp_dir + "'", 1); + QDir d; + d.mkpath(tmp_dir.toQString()); + + String tmp_file = tmp_dir + "param.txt"; + String tmp_pepxml = inputfile_name.substr(0,inputfile_name.rfind(".")) + ".pep.xml"; + + ofstream os(tmp_file); createParamFile_(os); os.close(); @@ -476,112 +504,46 @@ class TOPPCOMETAdapter : // calculations //------------------------------------------------------------- - String comet_executable(getStringOption_("comet_executable")); - int status = QProcess::execute(comet_executable.toQString(), 
QStringList(inputfile_name.toQString())); // does automatic escaping etc... + String param = "-P" + tmp_file; + QStringList process_params; + process_params << param.toQString() << inputfile_name.toQString(); + //qDebug() << process_params; + + String comet_executable = getStringOption_("comet_executable"); + //int status = QProcess::execute(comet_executable.toQString(), QStringList(inputfile_name.toQString())); // does automatic escaping etc... + int status = QProcess::execute(comet_executable.toQString(),process_params); // does automatic escaping etc... if (status != 0) { writeLog_("Comet problem. Aborting! Calling command was: '" + comet_executable + " \"" + inputfile_name + "\"'.\nDoes the Comet executable exist?"); // clean temporary files - /* if (this->debug_level_ < 2) { - File::removeDirRecursively(temp_directory); - LOG_WARN << "Set debug level to >=2 to keep the temporary files at '" << temp_directory << "'" << std::endl; + removeTempDir_(tmp_dir); + LOG_WARN << "Set debug level to >=2 to keep the temporary files at '" << tmp_dir << "'" << std::endl; } else { - LOG_WARN << "Keeping the temporary files at '" << temp_directory << "'. Set debug level to <2 to remove them." << std::endl; + LOG_WARN << "Keeping the temporary files at '" << tmp_dir << "'. Set debug level to <2 to remove them." << std::endl; } - */ //return EXTERNAL_PROGRAM_ERROR; } - } -}; - //vector protein_ids; - //ProteinIdentification protein_id; - //protein_id.setPrimaryMSRunPath(exp.getPrimaryMSRunPath()); - //vector peptide_ids; - -/* - // read the output of X! 
Tandem and write it to idXML - XTandemXMLFile tandem_output; - tandem_output.setModificationDefinitionsSet(ModificationDefinitionsSet(getStringList_("fixed_modifications"), getStringList_("variable_modifications"))); - // find the file, because XTandem extends the filename with a timestamp we do not know (exactly) - StringList files; - File::fileList(temp_directory, "_tandem_output_file*.xml", files); - if (files.size() != 1) - { - throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, tandem_output_filename); - } - tandem_output.load(temp_directory + files[0], protein_id, peptide_ids); - // now put the RTs into the peptide_ids from the spectrum ids - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - UInt id = (Int)it->getMetaValue("spectrum_id"); - --id; // native IDs were written 1-based - if (id < exp.size()) - { - it->setRT(exp[id].getRT()); - double pre_mz(0.0); - if (!exp[id].getPrecursors().empty()) pre_mz = exp[id].getPrecursors()[0].getMZ(); - it->setMZ(pre_mz); - //it->removeMetaValue("spectrum_id"); - } - else - { - LOG_ERROR << "XTandemAdapter: Error: id '" << id << "' not found in peak map!" 
<< endl; - } - } + // read the pep.xml output of COMET and write it to idXML + + vector peptide_identifications; + vector protein_identifications; + + PepXMLFile().load(tmp_pepxml, protein_identifications, peptide_identifications, inputfile_name); //------------------------------------------------------------- // writing output //------------------------------------------------------------- - // handle the search parameters - ProteinIdentification::SearchParameters search_parameters; - search_parameters.db = getStringOption_("database"); - search_parameters.charges = "+" + String(getIntOption_("min_precursor_charge")) + "-+" + String(getIntOption_("max_precursor_charge")); - - ProteinIdentification::PeakMassType mass_type = ProteinIdentification::MONOISOTOPIC; - search_parameters.mass_type = mass_type; - search_parameters.fixed_modifications = getStringList_("fixed_modifications"); - search_parameters.variable_modifications = getStringList_("variable_modifications"); - search_parameters.missed_cleavages = getIntOption_("missed_cleavages"); - search_parameters.fragment_mass_tolerance = getDoubleOption_("fragment_mass_tolerance"); - search_parameters.precursor_mass_tolerance = getDoubleOption_("precursor_mass_tolerance"); - search_parameters.precursor_mass_tolerance_ppm = getStringOption_("precursor_error_units") == "ppm" ? true : false; - search_parameters.fragment_mass_tolerance_ppm = getStringOption_("fragment_error_units") == "ppm" ? 
true : false; - search_parameters.digestion_enzyme = *EnzymesDB::getInstance()->getEnzyme(enzyme_name); - protein_id.setSearchParameters(search_parameters); - protein_id.setSearchEngineVersion(""); - protein_id.setSearchEngine("XTandem"); - - protein_ids.push_back(protein_id); - - IdXMLFile().store(outputfile_name, protein_ids, peptide_ids); - - /// Deletion of temporary files - if (this->debug_level_ < 2) - { - File::removeDirRecursively(temp_directory); - LOG_WARN << "Set debug level to >=2 to keep the temporary files at '" << temp_directory << "'" << std::endl; - } - else - { - LOG_WARN << "Keeping the temporary files at '" << temp_directory << "'. Set debug level to <2 to remove them." << std::endl; - } + IdXMLFile().store(outputfile_name, protein_identifications, peptide_identifications); - // some stats - LOG_INFO << "Statistics:\n" - << " identified MS2 spectra: " << peptide_ids.size() << " / " << exp.size() << " = " << int(peptide_ids.size() * 100.0 / exp.size()) << "% (with e-value < " << String(getDoubleOption_("max_valid_expect")) << ")" << std::endl; - - - return EXECUTION_OK; } -abi + }; -*/ int main(int argc, const char** argv) From f4efe76d60ae7d5b705daf38adf087aae0b923b7 Mon Sep 17 00:00:00 2001 From: Leon Bichmann Date: Fri, 24 Mar 2017 11:47:08 +0100 Subject: [PATCH 03/41] COMET Adapter now functional and removes pep.xml and .pin if debug ==0 --- src/openms/source/FORMAT/PepXMLFile.cpp | 1 + src/utils/COMETAdapter.cpp | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/openms/source/FORMAT/PepXMLFile.cpp b/src/openms/source/FORMAT/PepXMLFile.cpp index d5ce4b4ae9e..83b30f5ed20 100755 --- a/src/openms/source/FORMAT/PepXMLFile.cpp +++ b/src/openms/source/FORMAT/PepXMLFile.cpp @@ -767,6 +767,7 @@ namespace OpenMS if (!exp_name_.empty()) { String base_name = attributeAsString_(attributes, "base_name"); + cout << "base name:" << base_name << " exp_name_:" << exp_name_ << endl; if (!base_name.empty()) { 
wrong_experiment_ = !base_name.hasSuffix(exp_name_); diff --git a/src/utils/COMETAdapter.cpp b/src/utils/COMETAdapter.cpp index 9042b6669cd..48ef6f7e206 100755 --- a/src/utils/COMETAdapter.cpp +++ b/src/utils/COMETAdapter.cpp @@ -405,7 +405,7 @@ class TOPPCOMETAdapter : os << "add_Z_user_amino_acid = " << 0.0000 << "\n"; // added to Z - avg. 0.0000, mono. 0.00000 // COMET_ENZYME_INFO _must_ be at the end of this parameters file - os << "[COMET_ENZYME_INFO]"; + os << "[COMET_ENZYME_INFO]" << "\n"; os << "0. No_enzyme 0 - -" << "\n"; os << "1. Trypsin 1 KR P" << "\n"; os << "2. Trypsin/P 1 KR -" << "\n"; @@ -471,7 +471,8 @@ class TOPPCOMETAdapter : d.mkpath(tmp_dir.toQString()); String tmp_file = tmp_dir + "param.txt"; - String tmp_pepxml = inputfile_name.substr(0,inputfile_name.rfind(".")) + ".pep.xml"; + String tmp_pepxml = File::removeExtension(inputfile_name) + ".pep.xml"; + String tmp_pin = File::removeExtension(inputfile_name) + ".pin"; ofstream os(tmp_file); createParamFile_(os); @@ -533,8 +534,20 @@ class TOPPCOMETAdapter : vector peptide_identifications; vector protein_identifications; + writeDebug_("write PepXMLFile", 1); PepXMLFile().load(tmp_pepxml, protein_identifications, peptide_identifications, inputfile_name); + if (this->debug_level_ == 0) + { + File::remove(tmp_pepxml); + File::remove(tmp_pin); + LOG_WARN << "Set debug level to >0 to keep the temporary pep.xml and pin files at '" << tmp_pepxml << "'" << std::endl; + } + else + { + LOG_WARN << "Keeping the temporary files at '" << tmp_pepxml << "'. Set debug level to 0 to remove them." 
<< std::endl; + } + //------------------------------------------------------------- // writing output //------------------------------------------------------------- From c1f7480fa60dfed97170e8a8127f3c8e21171da1 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Tue, 26 May 2015 17:58:38 +0200 Subject: [PATCH 04/41] [NOP] documentation --- src/openms/source/FORMAT/HANDLERS/MzIdentMLDOMHandler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/source/FORMAT/HANDLERS/MzIdentMLDOMHandler.cpp b/src/openms/source/FORMAT/HANDLERS/MzIdentMLDOMHandler.cpp index 621ac3cd299..b7b6ae0a65f 100644 --- a/src/openms/source/FORMAT/HANDLERS/MzIdentMLDOMHandler.cpp +++ b/src/openms/source/FORMAT/HANDLERS/MzIdentMLDOMHandler.cpp @@ -2032,7 +2032,7 @@ namespace OpenMS hit.setMetaValue(up->first, up->second); } hit.setMetaValue("calcMZ", calculatedMassToCharge); - spectrum_identification.setMZ(experimentalMassToCharge); // TODO @ mths: why is this not in SpectrumIdentificationResult? exp. m/z for one spec should not change from one id for it to the next! + spectrum_identification.setMZ(experimentalMassToCharge); // TODO @ mths for next PSI meeting: why is this not in SpectrumIdentificationResult in the schema? exp. m/z for one spec should not change from one id for it to the next! 
hit.setMetaValue("pass_threshold", pass); //TODO @ mths do not write metavalue pass_threshold //connect the PeptideHit with PeptideEvidences (for AABefore/After) and subsequently with DBSequence (for ProteinAccession) From 831b4de8d69ed7f5c5d2739f110e9d3591a32678 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Tue, 26 May 2015 17:59:28 +0200 Subject: [PATCH 05/41] [FIX] refactoring and cleanup of TopPerc --- src/utils/TopPerc.cpp | 860 +++++++++++++++++++++--------------------- 1 file changed, 437 insertions(+), 423 deletions(-) diff --git a/src/utils/TopPerc.cpp b/src/utils/TopPerc.cpp index 07289124548..afca2d454f3 100644 --- a/src/utils/TopPerc.cpp +++ b/src/utils/TopPerc.cpp @@ -40,17 +40,9 @@ #include #include #include -#include #include -#include -#include -#include -#include -#include -#include #include #include -#include #include #include @@ -90,7 +82,7 @@ using namespace std; -

Percolator is search engine sensitive, i.e. it's input has to vary, depending on the search engine.

+

Percolator is search engine sensitive, i.e. its input features vary, depending on the search engine.

The command line parameters of this tool are: @verbinclude TOPP_TopPerc.cli @@ -114,9 +106,21 @@ class TOPPPercolator : } protected: - void preparePIN(vector& peptide_ids, bool is_decoy, TextFile& txt, int minCharge, int maxCharge) + void prepareMSGFpin(vector& peptide_ids, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t') { - char out_sep = '\t'; + // Create String of the charges for the header of the tab file + stringstream ss; + ss << "Charge" << minCharge << ", "; + for (int j = minCharge + 1; j < maxCharge + 1; j++) + { + ss << "Charge" << j << ","; + } + + // Create header for the features + string featureset = "SpecId, Label,ScanNr, RawScore, DeNovoScore,ScoreRatio, Energy,lnEValue,IsotopeError, lnExplainedIonCurrentRatio,lnNTermIonCurrentRatio,lnCTermIonCurrentRatio,lnMS2IonCurrent,Mass,PepLen,dM,absdM,MeanErrorTop7,sqMeanErrorTop7,StdevErrorTop7," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; + StringList txt_header0 = ListUtils::create(featureset); + txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) @@ -130,19 +134,20 @@ class TOPPPercolator : int rank = hit->getRank(); int charge = hit->getCharge(); - String spec_ref = it->getMetaValue("spectrum_id").toQString().toStdString(); //TODO consider other spectraIDFormats or keep index only in metavalue + String spec_ref = it->getMetaValue("spectrum_reference").toString(); vector scan_id; spec_ref.split("scan=", scan_id); + String sid = scan_id.back(); int label = 1; String SpecId = "target_SII_"; - if (is_decoy) + if ((String(hit->getMetaValue("target_decoy"))).hasSubstring("decoy")) { SpecId = "decoy_SII_"; label = -1; } - SpecId += scan_id[1] + "_" + String(rank) + "_" + scan_id[1] + "_" + String(charge) + "_" + String(rank); + SpecId += sid + "_" + String(rank) + "_" + sid + "_" + String(charge) + "_" + 
String(rank); double rawScore = hit->getMetaValue("MS:1002049").toString().toDouble(); double denovoScore = hit->getMetaValue("MS:1002050").toString().toDouble(); @@ -166,15 +171,13 @@ class TOPPPercolator : double lnMS2IonCurrent = log(hit->getMetaValue("MS2IonCurrent").toString().toDouble()); double expMass = it->getMZ(); double calcMass = hit->getMetaValue("calcMZ"); - int pepLen = hit->getSequence().toString().length(); + int pepLen = hit->getSequence().toUnmodifiedString().length(); double dM = (expMass - (isotopeError * Constants::NEUTRON_MASS_U / charge) - calcMass) / expMass; double absdM = abs(dM); - - double meanErrorTop7 = hit->getMetaValue("MeanErrorTop7").toString().toDouble(); int NumMatchedMainIons = hit->getMetaValue("NumMatchedMainIons").toString().toInt(); - double stdevErrorTop7 = 0.0; + double stdevErrorTop7 = 0.0; if (hit->getMetaValue("StdevErrorTop7").toString() != "NaN") { stdevErrorTop7 = hit->getMetaValue("StdevErrorTop7").toString().toDouble(); @@ -240,6 +243,235 @@ class TOPPPercolator : } } + void prepareXTANDEMpin(vector& peptide_ids, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t') + { + // Create String of the charges for the header of the tab file + stringstream ss; + ss << "Charge" << minCharge << ", "; + for (int j = minCharge + 1; j < maxCharge + 1; j++) + { + + ss << "Charge" << j << ","; + } + + // Find out which ions are in XTandem-File and take only these as features + stringstream ss_ion; + if (peptide_ids.front().getHits().front().getMetaValue("a_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion << "frac_ion_a" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("b_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion << "frac_ion_b" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("c_score").toString() != "" && + 
peptide_ids.front().getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion << "frac_ion_c" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("x_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion << "frac_ion_x" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("y_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion << "frac_ion_y" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("z_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion << "frac_ion_z" << ","; + } + + // Create header for the features + String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + + ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; + StringList txt_header0 = ListUtils::create(featureset); + // Insert the header with the features names to the file + txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); + + LOG_INFO << "read in target file" << endl; + // get all the features from the target file + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + if (it->isHigherScoreBetter()) + { + String scannumber = String(it->getMetaValue("spectrum_reference")); + int charge = it->getHits().front().getCharge(); + int label = 1; + double hyperscore = it->getHits().front().getScore(); + // deltascore = hyperscore - nextscore + double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); + String sequence = it->getHits().front().getSequence().toString(); + int length = sequence.length(); + + // Find out correct ion types and get its Values + stringstream ss_ion_2; + + if (it->getHits().front().getMetaValue("a_score").toString() != "" && + it->getHits().front().getMetaValue("a_ions").toString() != 
"") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("b_score").toString() != "" && + it->getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("c_score").toString() != "" && + it->getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("x_score").toString() != "" && + it->getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("y_score").toString() != "" && + it->getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("z_score").toString() != "" && + it->getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length << out_sep; + } + double mass = it->getHits().front().getMetaValue("mass"); + double dm = it->getHits().front().getMetaValue("delta"); + double mh = mass + dm; + double absdM = abs(dm); + + // write 1 for the correct charge, 0 for other charges + // i.e.: charge 3 for charges from 2-5: 0 1 0 0 + stringstream ss; + int i = minCharge; + while (i <= maxCharge) + { + if (charge != i) + { + ss << "0" << out_sep; + } + if (charge == i) + { + ss << "1" << out_sep; + } + i++; + } + + char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); + char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); + + String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; + + // formula taken 
from percolator converter isEnz(n, c) for trypsin + bool enzN = isEnz(peptide.at(0), peptide.at(2), getStringOption_("enzyme")); + bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), getStringOption_("enzyme")); + int enzInt = countEnzymatic(sequence, getStringOption_("enzyme")); + String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); + + // One PeptideSpectrumHit with all its features + String lis = "_tandem_output_file_target_" + scannumber + "_" + String(charge) + + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; + + // peptide Spectrum Hit pushed to the output file + txt.addLine(lis); + } + } + + LOG_INFO << "read in decoy file" << endl; + // get all the features from the decoy file + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + if (it->isHigherScoreBetter()) + { + String scannumber = String(it->getMetaValue("spectrum_reference")); + int charge = it->getHits().front().getCharge(); + int label = -1; + double hyperscore = it->getHits().front().getScore(); + // deltascore = hyperscore - nextscore + double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); + String sequence = it->getHits().front().getSequence().toString(); + int length = sequence.length(); + + // Find out correct ion types and get its Values + stringstream ss_ion_2; + + if (it->getHits().front().getMetaValue("a_score").toString() != "" && it->getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; + } + if 
(it->getHits().front().getMetaValue("b_score").toString() != "" && it->getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("c_score").toString() != "" && it->getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("x_score").toString() != "" && it->getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("y_score").toString() != "" && it->getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("z_score").toString() != "" && it->getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length; + } + double mass = it->getHits().front().getMetaValue("mass"); + double dm = double(it->getHits().front().getMetaValue("delta")); + double mh = mass + dm; + double absdM = abs(dm); + + // write 1 for the correct charge, 0 for other charges + // i.e: charge 3 for charges from 2-5: 0 1 0 0 + stringstream ss; + int i = minCharge; + while (i <= maxCharge) + { + if (charge != i) + { + ss << "0" << out_sep; + } + if (charge == i) + { + ss << "1" << out_sep; + } + i++; + } + + char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); + char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); + + String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; + + // formula taken from percolator converter isEnz(n, c) for trypsin + bool enzN = isEnz(peptide.at(0), peptide.at(2), 
getStringOption_("enzyme")); + bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), getStringOption_("enzyme")); + int enzInt = countEnzymatic(sequence, getStringOption_("enzyme")); + String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); + + // One PeptideSpectrumHit with all its features + String lis = "_tandem_output_file_decoy_" + scannumber + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep + + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; + + // peptide Spectrum Hit pushed to the output file + txt.addLine(lis); + } + } + } + // Function taken from Enzyme.h from Percolator bool isEnz(const char& n, const char& c, string enz) { @@ -329,57 +561,66 @@ class TOPPPercolator : void registerOptionsAndFlags_() { registerInputFile_("percolator_executable", "", "", "Path to the percolator binary", true, false, ListUtils::create("skipexists")); - registerInputFile_("in_target", "", "", "Input target file"); - registerInputFile_("in_decoy", "", "", "Input decoy file"); - setValidFormats_("in_target", ListUtils::create("mzid")); + registerInputFile_("in", "", "", "Input target file", true); + setValidFormats_("in", ListUtils::create("mzid")); + registerInputFile_("in_decoy", "", "", "Input decoy file", false); setValidFormats_("in_decoy", ListUtils::create("mzid")); - registerOutputFile_("out", "", "", "Output file", true); - registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin", false); - -// registerOutputFile_("r", "", "out", "Output tab delimited results to a file instead of stdout", false); - 
registerOutputFile_("X", "", "", "path to file in xml-output format (pout). Default is: pout.tab", false); - registerFlag_("e", "read xml-input format (pin) from standard input"); - registerFlag_("Z", "Include decoys (PSMs, peptides and/or proteins) in the xml-output. Only available if -X is used."); - registerDoubleOption_("p", "", 0.0, "Cpos, penalty for mistakes made on positive examples. Set by cross validation if not specified.", false); - registerDoubleOption_("n", "", 0.0, "Cneg, penalty for mistakes made on negative examples. Set by cross validation if not specified.", false); - registerDoubleOption_("F", "", 0.01, "False discovery rate threshold to define positive examples in training. Set by cross validation if 0. Default is 0.01.", false); - registerDoubleOption_("t", "", 0.01, "False discovery rate threshold for evaluating best cross validation result and the reported end result. Default is 0.01.", false); - registerIntOption_("i", "", 0, "Maximal number of iterations", false); - registerFlag_("x", "Quicker execution by reduced internal cross-validation."); - registerDoubleOption_("f", "", 0.6, "Fraction of the negative data set to be used as train set when only providing one negative set, remaining examples will be used as test set. Set to 0.6 by default.", false); - registerOutputFile_("J", "", "", "Output the computed features to the given file in tab-delimited format. A file with the features with the given file name will be created", false); - registerInputFile_("k", "", "", "Input file given in the deprecated pin-xml format generated by e.g. 
sqt2pin with the -k option", false); - registerOutputFile_("w", "", "", "Output final weights to the given file", false); - registerInputFile_("W", "", "", "Read initial weights to the given file", false); - registerStringOption_("V", "", "", "The most informative feature given as the feature name, can be negated to indicate that a lower value is better.", false); - registerIntOption_("v", "", 2, "Set verbosity of output: 0=no processing info, 5=all, default is 2", false); - registerFlag_("u", "Use unit normalization [0-1] instead of standard deviation normalization"); - registerFlag_("R", "Measure performance on test set each iteration"); - registerFlag_("O", "Override error check and do not fall back on default score vector in case of suspect score vector"); - registerIntOption_("S", "", 1, "Setting seed of the random number generator. Default value is 1", false); - registerFlag_("K", "Retention time features calculated as in Klammer et al."); - registerFlag_("D", "Include description of correct features"); - registerOutputFile_("B", "", "", "Output tab delimited results for decoys into a file", false); - registerFlag_("U", "Do not remove redundant peptides, keep all PSMS and exclude peptide level probabilities."); - registerFlag_("s", "skip validation of input file against xml schema"); - registerFlag_("A", "output protein level probabilities"); - registerDoubleOption_("a", "", 0.0, "Probability with which a present protein emits an associated peptide (to be used jointly with the -A option). Set by grid search if not specified.", false); - registerDoubleOption_("b", "", 0.0, "Probability of the creation of a peptide from noise (to be used jointly with the -A option). Set by grid search if not specified", false); - registerDoubleOption_("G", "", 0.0, "Prior probability of that a protein is present in the sample ( to be used with the -A option). 
Set by grid search if not specified", false); - registerFlag_("g", "treat ties as if it were one protein (Only valid if option -A is active)."); - registerFlag_("I", "use pi_0 value when calculating empirical q-values (no effect if option Q is activated) (Only valid if option -A is active)."); - registerFlag_("q", "output empirical q-values and p-values (from target-decoy analysis) (Only valid if option -A is active)."); - registerFlag_("N", "disactivates the grouping of proteins with similar connectivity, for example if proteins P1 and P2 have the same peptides matching both of them, P1 and P2 will not be grouped as one protein (Only valid if option -A is active)."); - registerFlag_("E", "Proteins graph will not be separated in sub-graphs (Only valid if option -A is active)."); - registerFlag_("C", "it does not prune peptides with a very low score (~0.0) which means that if a peptide with a very low score is matching two proteins, when we prune the peptide,it will be duplicated to generate two new protein groups (Only valid if option -A is active)."); - registerIntOption_("d", "", 0, "Setting depth 0 or 1 or 2 from low depth to high depth(less computational time) of the grid search for the estimation Alpha,Beta and Gamma parameters for fido(Only valid if option -A is active). 
Default value is 0", false); - registerStringOption_("P", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (by default : random) (Only valid if option -A is active).", false); - registerFlag_("T", "Reduce the tree of proteins (removing low scored proteins) in order to estimate alpha,beta and gamma faster.(Only valid if option -A is active)."); - registerFlag_("Y", "Use target decoy competition to compute peptide probabilities.(recommended when using -A)."); - registerFlag_("H", "Q-value threshold that will be used in the computation of the MSE and ROC AUC score in the grid search (recommended 0.05 for normal size datasets and 0.1 for big size datasets).(Only valid if option -A is active)."); - registerFlag_("fido-truncation", "Proteins with a very low score (< 0.001) will be truncated (assigned 0.0 probability).(Only valid if option -A is active)"); - registerFlag_("Q", "Uses protein group level inference, each cluster of proteins is either present or not, therefore when grouping proteins discard all possible combinations for each group.(Only valid if option -A is active and -N is inactive)."); + registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin", false, true); + registerInputFile_("percolator_executable", "", + // choose the default value according to the platform where it will be executed + #if defined(__APPLE__) + "percolator", + #else + "percolator.exe", + #endif + "Percolator executable of the installation e.g. 'percolator.exe'", true, false, ListUtils::create("skipexists") + ); + + //Advanced parameters +// //registerOutputFile_("r", "", "out", "Output tab delimited results to a file instead of stdout", false, true); + registerOutputFile_("X", "", "", "path to file in xml-output format (pout). 
Default is: pout.tab", false, true); + registerFlag_("e", "read xml-input format (pin) from standard input", true); + registerFlag_("Z", "Include decoys (PSMs, peptides and/or proteins) in the xml-output. Only available if -X is used.", true); + registerDoubleOption_("p", "", 0.0, "Cpos, penalty for mistakes made on positive examples. Set by cross validation if not specified.", false, true); + registerDoubleOption_("n", "", 0.0, "Cneg, penalty for mistakes made on negative examples. Set by cross validation if not specified.", false, true); + registerDoubleOption_("F", "", 0.01, "False discovery rate threshold to define positive examples in training. Set by cross validation if 0. Default is 0.01.", false, true); + registerDoubleOption_("t", "", 0.01, "False discovery rate threshold for evaluating best cross validation result and the reported end result. Default is 0.01.", false, true); + registerIntOption_("i", "", 0, "Maximal number of iterations", false, true); + registerFlag_("x", "Quicker execution by reduced internal cross-validation.", true); + registerDoubleOption_("f", "", 0.6, "Fraction of the negative data set to be used as train set when only providing one negative set, remaining examples will be used as test set. Set to 0.6 by default.", false, true); + registerOutputFile_("J", "", "", "Output the computed features to the given file in tab-delimited format. A file with the features with the given file name will be created", false, true); + registerInputFile_("k", "", "", "Input file given in the deprecated pin-xml format generated by e.g. 
sqt2pin with the -k option", false, true); + registerOutputFile_("w", "", "", "Output final weights to the given file", false, true); + registerInputFile_("W", "", "", "Read initial weights to the given file", false, true); + registerStringOption_("V", "", "", "The most informative feature given as the feature name, can be negated to indicate that a lower value is better.", false, true); + registerIntOption_("v", "", 2, "Set verbosity of output: 0=no processing info, 5=all, default is 2", false, true); + registerFlag_("u", "Use unit normalization [0-1] instead of standard deviation normalization", true); + registerFlag_("R", "Measure performance on test set each iteration", true); + registerFlag_("O", "Override error check and do not fall back on default score vector in case of suspect score vector", true); + registerIntOption_("S", "", 1, "Setting seed of the random number generator. Default value is 1", false, true); + registerFlag_("K", "Retention time features calculated as in Klammer et al.", true); + registerFlag_("D", "Include description of correct features", true); + registerOutputFile_("B", "", "", "Output tab delimited results for decoys into a file", false, true); + registerFlag_("U", "Do not remove redundant peptides, keep all PSMS and exclude peptide level probabilities.", true); + registerFlag_("s", "skip validation of input file against xml schema", true); + registerFlag_("A", "output protein level probabilities", true); + registerDoubleOption_("a", "", 0.0, "Probability with which a present protein emits an associated peptide (to be used jointly with the -A option). Set by grid search if not specified.", false, true); + registerDoubleOption_("b", "", 0.0, "Probability of the creation of a peptide from noise (to be used jointly with the -A option). Set by grid search if not specified", false, true); + registerDoubleOption_("G", "", 0.0, "Prior probability of that a protein is present in the sample ( to be used with the -A option). 
Set by grid search if not specified", false, true); + registerFlag_("g", "treat ties as if it were one protein (Only valid if option -A is active).", true); + registerFlag_("I", "use pi_0 value when calculating empirical q-values (no effect if option Q is activated) (Only valid if option -A is active).", true); + registerFlag_("q", "output empirical q-values and p-values (from target-decoy analysis) (Only valid if option -A is active).", true); + registerFlag_("N", "disactivates the grouping of proteins with similar connectivity, for example if proteins P1 and P2 have the same peptides matching both of them, P1 and P2 will not be grouped as one protein (Only valid if option -A is active).", true); + registerFlag_("E", "Proteins graph will not be separated in sub-graphs (Only valid if option -A is active).", true); + registerFlag_("C", "it does not prune peptides with a very low score (~0.0) which means that if a peptide with a very low score is matching two proteins, when we prune the peptide,it will be duplicated to generate two new protein groups (Only valid if option -A is active).", true); + registerIntOption_("d", "", 0, "Setting depth 0 or 1 or 2 from low depth to high depth(less computational time) of the grid search for the estimation Alpha,Beta and Gamma parameters for fido(Only valid if option -A is active). 
Default value is 0", false, true); + registerStringOption_("P", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (by default : random) (Only valid if option -A is active).", false, true); + registerFlag_("T", "Reduce the tree of proteins (removing low scored proteins) in order to estimate alpha,beta and gamma faster.(Only valid if option -A is active).", true); + registerFlag_("Y", "Use target decoy competition to compute peptide probabilities.(recommended when using -A).", true); + registerFlag_("H", "Q-value threshold that will be used in the computation of the MSE and ROC AUC score in the grid search (recommended 0.05 for normal size datasets and 0.1 for big size datasets).(Only valid if option -A is active).", true); + registerFlag_("fido-truncation", "Proteins with a very low score (< 0.001) will be truncated (assigned 0.0 probability).(Only valid if option -A is active)", true); + registerFlag_("Q", "Uses protein group level inference, each cluster of proteins is either present or not, therefore when grouping proteins discard all possible combinations for each group.(Only valid if option -A is active and -N is inactive).", true); } ExitCodes main_(int, const char**) @@ -389,390 +630,140 @@ class TOPPPercolator : //------------------------------------------------------------- vector peptide_ids; vector protein_ids; - vector peptide_ids_d; - vector protein_ids_d; //------------------------------------------------------------- - // parsing parameters and crashing if mandatory parameters are missing + // parsing parameters //------------------------------------------------------------- - String inputfile_target_name = getStringOption_("in_target").toQString().toStdString(); - writeDebug_(String("Input file of target: ") + inputfile_target_name, 1); - if (inputfile_target_name == "") + const String in = getStringOption_("in"); + const String 
in_decoy = getStringOption_("in_decoy"); + writeDebug_(String("Input file of target: ") + in + " " + in_decoy, 2); + + const String percolator_executable(getStringOption_("percolator_executable")); + writeDebug_(String("Path to the percolator: ") + percolator_executable, 2); + if (percolator_executable.empty()) //TODO TOPPBase::findExecutable { - writeLog_("No target input file specified. Aborting!"); + writeLog_("No percolator executable specified. Aborting!"); printUsage_(); return ILLEGAL_PARAMETERS; } - String inputfile_decoy_name = getStringOption_("in_decoy").toQString().toStdString(); - writeDebug_(String("Input file of decoy: ") + inputfile_decoy_name, 1); - if (inputfile_decoy_name == "") + //------------------------------------------------------------- + // read input + //------------------------------------------------------------- + FileHandler fh; + FileTypes::Type in_type = fh.getType(in); + if (in_type == FileTypes::IDXML) + { + IdXMLFile().load(in, protein_ids, peptide_ids); + } + else if (in_type == FileTypes::MZIDENTML) + { + LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; + MzIdentMLFile().load(in, protein_ids, peptide_ids); + } + //else catched by TOPPBase:registerInput being mandatory mzid or idxml + + if (peptide_ids.empty()) { - writeLog_("No decoy input file specified. Aborting!"); + writeLog_("No or empty input file specified. Aborting!"); printUsage_(); return ILLEGAL_PARAMETERS; } - String percolator_executable(getStringOption_("percolator_executable")); - writeDebug_(String("Path to the percolator: ") + percolator_executable, 1); - if (percolator_executable == "") //TODO TOPPBase::findExecutable + if (peptide_ids.empty()) { - writeLog_("No path to percolator specified. Aborting!"); + writeLog_("No or empty input file specified. 
Aborting!"); printUsage_(); return ILLEGAL_PARAMETERS; } - // get the file extension of the input files to start the correct converter - vector input_target_file; - vector input_decoy_file; - inputfile_target_name.split('.', input_target_file); - String data_target = input_target_file[input_target_file.size() - 1]; - inputfile_decoy_name.split('.', input_decoy_file); - String data_decoy = input_decoy_file[input_decoy_file.size() - 1]; - - TextFile txt; - char out_sep = '\t'; - - //get Information about database search - String datab = ""; - - // converter for MSGF+ & Mascot Files - if (data_target == "mzid" && data_decoy == "mzid") + //------------------------------------------------------------- + // read more input if necessary + //------------------------------------------------------------- + //TODO check if this comes from the same search engine! + if (!in_decoy.empty()) { - datab = "MSGF+"; - - // TODO FOR FUTURE DEVELOPMENT: check out without explicit parameter setting if input file is target or decoy!!! 
- // Both input files are read in - MzIdentMLFile().load(inputfile_target_name, protein_ids, peptide_ids); - MzIdentMLFile().load(inputfile_decoy_name, protein_ids_d, peptide_ids_d); - LOG_INFO << "Using IDs from" << protein_ids.back().getSearchEngine() << endl; - - // Open File and check if the Identifier is MSGF+ - if (peptide_ids.front().getIdentifier() == "MS-GF+" && peptide_ids_d.front().getIdentifier() == "MS-GF+") + vector decoy_peptide_ids; + vector decoy_protein_ids; + FileTypes::Type in_decoy_type = fh.getType(in_decoy); + if (in_decoy_type == FileTypes::IDXML) { - - // Find out how many possible charges are available - int maxCharge = 0; - int minCharge = 10; - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) - { - if (hit->getCharge() > maxCharge) - { - maxCharge = hit->getCharge(); - } - if (hit->getCharge() < minCharge) - { - minCharge = hit->getCharge(); - } - } - } - - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge + 1; j < maxCharge + 1; j++) - { - ss << "Charge" << j << ","; - } - - // Create header for the features - string featureset = "SpecId, Label,ScanNr, RawScore, DeNovoScore,ScoreRatio, Energy,lnEValue,IsotopeError, lnExplainedIonCurrentRatio,lnNTermIonCurrentRatio,lnCTermIonCurrentRatio,lnMS2IonCurrent,Mass,PepLen,dM,absdM,MeanErrorTop7,sqMeanErrorTop7,StdevErrorTop7," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; - StringList txt_header0 = ListUtils::create(featureset); - txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); - LOG_INFO << "consuming target file" << endl; - // get all the features from the target file - preparePIN(peptide_ids, false, txt, minCharge, maxCharge); - LOG_INFO << "consuming decoy file" << endl; - // get all the features from the decoy file - preparePIN(peptide_ids_d, true, txt, 
minCharge, maxCharge); + IdXMLFile().load(in_decoy, decoy_protein_ids, decoy_peptide_ids); } - else if (peptide_ids.front().getIdentifier() == "Mascot" && peptide_ids_d.front().getIdentifier() == "Mascot") + else if (in_decoy_type == FileTypes::MZIDENTML) { - // TODO: Mascot Implementation + LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; + MzIdentMLFile().load(in_decoy, decoy_protein_ids, decoy_peptide_ids); } - } - // converter for XTandem-Files - // TODO IN FUTURE DEVELOPMENT: IMPLEMENT MZID READER FOR XTANDEMFILES - else if (data_target == "idXML" && data_decoy == "idXML") - { - datab = "XTANDEM"; - IdXMLFile file; - IdXMLFile decoy_file; - file.load(getStringOption_("in_target"), protein_ids, peptide_ids); - decoy_file.load(getStringOption_("in_decoy"), protein_ids_d, peptide_ids_d); - - // Find out how many possible charges are available - int maxCharge = 0; - int minCharge = 10; - - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + for (std::vector::iterator pit = decoy_peptide_ids.begin(); pit != decoy_peptide_ids.end(); ++pit) { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + for (std::vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) { - if (hit->getCharge() > maxCharge) - { - maxCharge = hit->getCharge(); - } - if (hit->getCharge() < minCharge) - { - minCharge = hit->getCharge(); - } + pht->setMetaValue("target_decoy", "decoy"); + //TODO what about proteins - internal target decoy handling is shitty - rework } } + //TODO this is going to fail with specrum_reference clashes + peptide_ids.insert( peptide_ids.end(), decoy_peptide_ids.begin(), decoy_peptide_ids.end() ); + protein_ids.insert( protein_ids.end(), decoy_protein_ids.begin(), decoy_protein_ids.end() ); + writeLog_("Using decoy hits from separate file."); + } + else + { + writeLog_("Using decoy hits 
from input id file. You did you use a target decoy search, did you?"); +// printUsage_(); +// return ILLEGAL_PARAMETERS; + } - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge + 1; j < maxCharge + 1; j++) - { - - ss << "Charge" << j << ","; - } - // Find out which ions are in XTandem-File and take only these as features - stringstream ss_ion; - if (peptide_ids.front().getHits().front().getMetaValue("a_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion << "frac_ion_a" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("b_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion << "frac_ion_b" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("c_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion << "frac_ion_c" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("x_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion << "frac_ion_x" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("y_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion << "frac_ion_y" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("z_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion << "frac_ion_z" << ","; - } - - // Create header for the features - String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + - ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; - StringList txt_header0 = ListUtils::create(featureset); - // Insert the header with the features 
names to the file - txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); + //------------------------------------------------------------- + // extract search engine and prepare pin + //------------------------------------------------------------- + String se = protein_ids.front().getSearchEngine(); + TextFile txt; - LOG_INFO << "read in target file" << endl; - // get all the features from the target file - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + //TODO introduce min/max charge to parameters for now take available range + int maxCharge = 0; + int minCharge = 10; + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - if (it->isHigherScoreBetter()) + if (hit->getCharge() > maxCharge) { - //TODO this must be spectrum_reference!!! parse spectrum number from there if necessary! - String scannumber = String(it->getMetaValue("spectrum_id")); - int charge = it->getHits().front().getCharge(); - int label = 1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out correct ion types and get its Values - stringstream ss_ion_2; - - if (it->getHits().front().getMetaValue("a_score").toString() != "" && - it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && - it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if 
(it->getHits().front().getMetaValue("c_score").toString() != "" && - it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && - it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && - it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && - it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length << out_sep; - } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = it->getHits().front().getMetaValue("delta"); - double mh = mass + dm; - double absdM = abs(dm); - - // write 1 for the correct charge, 0 for other charges - // i.e.: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), getStringOption_("enzyme")); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), getStringOption_("enzyme")); - int enzInt = countEnzymatic(sequence, getStringOption_("enzyme")); - String protein = 
it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_target_" + scannumber + "_" + String(charge) + - "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + - out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + - String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + - String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); + maxCharge = hit->getCharge(); } - } - - LOG_INFO << "read in decoy file" << endl; - // get all the features from the decoy file - for (vector::iterator it = peptide_ids_d.begin(); it != peptide_ids_d.end(); ++it) - { - if (it->isHigherScoreBetter()) + if (hit->getCharge() < minCharge) { - String scannumber = String(it->getMetaValue("spectrum_id")); - int charge = it->getHits().front().getCharge(); - int label = -1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out correct ion types and get its Values - stringstream ss_ion_2; - - if (it->getHits().front().getMetaValue("a_score").toString() != "" && it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" 
&& it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length; - } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = double(it->getHits().front().getMetaValue("delta")); - double mh = mass + dm; - double absdM = abs(dm); - - // write 1 for the correct charge, 0 for other charges - // i.e: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), getStringOption_("enzyme")); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), getStringOption_("enzyme")); - int enzInt = countEnzymatic(sequence, getStringOption_("enzyme")); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One 
PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_decoy_" + scannumber + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep - + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); + minCharge = hit->getCharge(); } } } - else - { - LOG_INFO << "target and decoy files are not of the same type" << endl; - } - LOG_INFO << "Executing percolator" << endl; + writeDebug_("Detected search engine: " + se , 2); + if (se == "MS-GF+") prepareMSGFpin(peptide_ids, txt, minCharge, maxCharge); +// if (se == "Mascot") prepareMASCOTpin(peptide_ids, txt, minCharge, maxCharge); + if (se == "XTandem") prepareXTANDEMpin(peptide_ids, txt, minCharge, maxCharge); + + writeLog_( "Executing percolator!"); // create temp directory to store percolator in file pin.tab temporarily - QDir qdir_temp(File::getTempDirectory().toQString()); - String temp_data_directory = File::getUniqueName(); - qdir_temp.mkdir(temp_data_directory.toQString()); - qdir_temp.cd(temp_data_directory.toQString()); - temp_data_directory = File::getTempDirectory() + "/" + temp_data_directory; - String in_file = temp_data_directory + "/" + File::getUniqueName() + ".tab"; - String out_file = temp_data_directory + "/" + File::getUniqueName() + ".tab"; + String temp_directory_body = QDir::toNativeSeparators((File::getTempDirectory() + "/" + File::getUniqueName() + "/").toQString()); // body for the tmp files + { + QDir d; + d.mkpath(temp_directory_body.toQString()); + } + + String txt_designator = File::getUniqueName(); + String pin_file(temp_directory_body + txt_designator + "_pin.tab"); + String pout_file(temp_directory_body + 
txt_designator + "_pout.tab"); // File is stored in temp directory - txt.store(in_file); + txt.store(pin_file); - QProcess process; QStringList arguments; - // Check all set parameters and get them into arguments StringList - arguments << "-r" << out_file.toQString(); + arguments << "-r" << pout_file.toQString(); if (getFlag_("e")) arguments << "-e"; if (getFlag_("Z")) arguments << "-Z"; if (getDoubleOption_("p") != 0.0) arguments << "-p" << String(getDoubleOption_("p")).toQString(); @@ -815,18 +806,33 @@ class TOPPPercolator : if (getFlag_("fido-truncation")) arguments << "--fido-truncation"; if (getFlag_("Q")) arguments << "-Q"; arguments << "-U"; - arguments << in_file.toQString(); + arguments << pin_file.toQString(); // Percolator execution with the executable ant the arguments StringList - process.execute(percolator_executable.toQString(), arguments); // does automatic escaping etc... + int status = QProcess::execute(percolator_executable.toQString(), arguments); // does automatic escaping etc... + if (status != 0) + { + writeLog_("Percolator problem. Aborting! Calling command was: '" + percolator_executable + " \"" + arguments.join("-").toStdString() + "\"."); + // clean temporary files + if (this->debug_level_ < 2) + { + File::removeDirRecursively(temp_directory_body); + LOG_WARN << "Set debug level to >=2 to keep the temporary files at '" << temp_directory_body << "'" << std::endl; + } + else + { + LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <2 to remove them." 
<< std::endl; + } + return EXTERNAL_PROGRAM_ERROR; + } // when percolator finished calculation, it stores the results -r option (with or without -U) or -m (which seems to be not working) - CsvFile csv_file(out_file, '\t'); + CsvFile csv_file(pout_file, '\t'); map > pep_map; StringList row; - for (UInt i = 1; i < csv_file.rowCount(); ++i) + for (size_t i = 1; i < csv_file.rowCount(); ++i) { csv_file.getRow(i, row); vector row_values; @@ -841,34 +847,46 @@ class TOPPPercolator : vector substr; row[0].split('_', substr); +// writeDebug_("Mapping input to key: " + substr[2] , 2); pep_map[substr[2]] = row_values; // scannr. as written in preparePIN } + // As the percolator output file is not needed anymore, the temporary directory is going to be deleted + if (this->debug_level_ < 99) + { + File::removeDirRecursively(temp_directory_body); + } + else + { + LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <2 to remove them." << std::endl; + } // Add the percolator results to the peptide vector of the original input file + size_t c_debug = 0; for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { - String sid = it->getMetaValue("spectrum_id"); + String sid = it->getMetaValue("spectrum_reference"); if (pep_map.find(sid) == pep_map.end()) { + //writeDebug_("No suitable PeptideIdentification entry 1st found for " + sid , 2); vector sr; sid.split('=', sr); sid = sr.back(); if (pep_map.find(sid) == pep_map.end()) { - //no spectrum found - log? 
+ writeDebug_("No suitable PeptideIdentification entry 2nd found for " + sid + " - emulate percolator scores with exisiting scores?", 111); + ++c_debug; continue; } } it->setScoreType("q-value"); it->setHigherScoreBetter(false); - vector temp; - swap(temp, it->getHits()); - for (vector::iterator hit = temp.begin(); hit != temp.end(); ++hit) + AASequence aat; + aat.fromString(pep_map[sid][0]); + + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - AASequence aat; - aat.fromString(pep_map[sid][0]); if (hit->getSequence() == aat) { //get aa before/after/charge and metainfo @@ -877,16 +895,15 @@ class TOPPPercolator : hit->setMetaValue("MS:1001491", qv); hit->setScore(qv); hit->setMetaValue("MS:1001493", pep_map[sid][3].toDouble()); //pep - hit->setSequence(aat); - it->insertHit(*hit); } } - // TODO what with those not in percolator result file -> empty PeptideHit vector? } + writeDebug_("No suitable PeptideIdentification for " + String(c_debug) + " out of " + String(peptide_ids.size()), 2); for (vector::iterator it = protein_ids.begin(); it != protein_ids.end(); ++it) { it->setSearchEngine("Percolator"); + //TODO add software percolator and topperc } // Storing the PeptideHits with calculated q-value, pep and svm score @@ -894,9 +911,6 @@ class TOPPPercolator : LOG_INFO << "TopPerc finished successfully!" 
<< endl; - // As the percolator output file is not needed anymore, the temporary directory is going to be deleted -// File::removeDirRecursively(temp_data_directory); - return EXECUTION_OK; } From 9dd939c89fd209a5ee3f48a5cd231005d390522c Mon Sep 17 00:00:00 2001 From: mwalzer Date: Mon, 1 Jun 2015 10:33:36 +0200 Subject: [PATCH 06/41] [NOP] fileheader and cmake files --- src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake | 1 + src/openms/source/ANALYSIS/ID/sources.cmake | 1 + 2 files changed, 2 insertions(+) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake index d17134f8bb3..6c25f4b381c 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake +++ b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake @@ -23,6 +23,7 @@ MetaboliteSpectralMatching.h PeptideProteinResolution.h ProtonDistributionModel.h PeptideIndexing.h +TopPerc.h ) ### add path to the filenames diff --git a/src/openms/source/ANALYSIS/ID/sources.cmake b/src/openms/source/ANALYSIS/ID/sources.cmake index 341aefedcef..6aa2d36ebb3 100644 --- a/src/openms/source/ANALYSIS/ID/sources.cmake +++ b/src/openms/source/ANALYSIS/ID/sources.cmake @@ -23,6 +23,7 @@ MetaboliteSpectralMatching.cpp PeptideProteinResolution.cpp ProtonDistributionModel.cpp PeptideIndexing.cpp +TopPerc.cpp ) ### add path to the filenames From b8aa4ede6872cb937b5160c723ad4ff43250585f Mon Sep 17 00:00:00 2001 From: mwalzer Date: Tue, 23 Jun 2015 17:37:35 +0200 Subject: [PATCH 07/41] [FEATURE] added a replacement feature for msgf+ mhc ligand identifications --- src/openms/source/ANALYSIS/ID/sources.cmake | 2 +- src/utils/TopPerc.cpp | 96 +++++++++++++++++---- 2 files changed, 79 insertions(+), 19 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/sources.cmake b/src/openms/source/ANALYSIS/ID/sources.cmake index 6aa2d36ebb3..60bba18c8aa 100644 --- a/src/openms/source/ANALYSIS/ID/sources.cmake +++ b/src/openms/source/ANALYSIS/ID/sources.cmake @@ -23,7 +23,7 @@ 
MetaboliteSpectralMatching.cpp PeptideProteinResolution.cpp ProtonDistributionModel.cpp PeptideIndexing.cpp -TopPerc.cpp +#TopPerc.cpp ) ### add path to the filenames diff --git a/src/utils/TopPerc.cpp b/src/utils/TopPerc.cpp index afca2d454f3..4faef43fcfa 100644 --- a/src/utils/TopPerc.cpp +++ b/src/utils/TopPerc.cpp @@ -106,7 +106,7 @@ class TOPPPercolator : } protected: - void prepareMSGFpin(vector& peptide_ids, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t') + void prepareMSGFpin(vector& peptide_ids, TextFile& txt, int minCharge, int maxCharge, bool addMHC = false, char out_sep='\t') { // Create String of the charges for the header of the tab file stringstream ss; @@ -117,8 +117,24 @@ class TOPPPercolator : } // Create header for the features - string featureset = "SpecId, Label,ScanNr, RawScore, DeNovoScore,ScoreRatio, Energy,lnEValue,IsotopeError, lnExplainedIonCurrentRatio,lnNTermIonCurrentRatio,lnCTermIonCurrentRatio,lnMS2IonCurrent,Mass,PepLen,dM,absdM,MeanErrorTop7,sqMeanErrorTop7,StdevErrorTop7," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; + string featureset = "SpecId, Label,ScanNr, RawScore, DeNovoScore,ScoreRatio, Energy,lnEValue,IsotopeError, lnExplainedIonCurrentRatio,lnNTermIonCurrentRatio,lnCTermIonCurrentRatio,lnMS2IonCurrent,Mass,PepLen,dM,absdM,MeanErrorTop7,sqMeanErrorTop7,StdevErrorTop7," + ss.str() ; StringList txt_header0 = ListUtils::create(featureset); + if (addMHC) + { + txt_header0.push_back("enzN"); + txt_header0.push_back("enzC"); + txt_header0.push_back("MHCLct"); + txt_header0.push_back("Peptide"); + txt_header0.push_back("Protein"); + } + else + { + txt_header0.push_back("enzN"); + txt_header0.push_back("enzC"); + txt_header0.push_back("enzInt"); + txt_header0.push_back("Peptide"); + txt_header0.push_back("Protein"); + } txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) @@ -232,8 +248,28 @@ class TOPPPercolator : 
(String)lnNTermIonCurrentRatio + out_sep + (String)lnCTermIonCurrentRatio + out_sep + (String)lnMS2IonCurrent + out_sep + (String)expMass + out_sep + (String)pepLen + out_sep + (String)dM + out_sep + (String)absdM + out_sep + (String)meanErrorTop7 + out_sep + (String)sqMeanErrorTop7 + out_sep + (String)stdevErrorTop7 + - out_sep + String(ss.str()) + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + - peptide_with_modifications + out_sep + protein + out_sep; + out_sep + String(ss.str()); + if (addMHC) + { + bool suf = false; + static const string arr[] = {"A", "F", "I", "K", "M", "L", "R", "W", "V"}; + vector mhcends (arr, arr + sizeof(arr) / sizeof(arr[0]) ); + for (std::vector::iterator eit = mhcends.begin(); eit != mhcends.end(); ++eit) + { + if (hit->getSequence().toUnmodifiedString().hasSuffix(string(*eit))) + { + suf = true; + break; + } + } + lis = lis + String(enzN) + out_sep + String(enzC) + out_sep + + String(suf) + out_sep + peptide_with_modifications + out_sep + protein + out_sep; + } + else + { + lis = lis + String(enzN) + out_sep + String(enzC) + out_sep + + String(enzInt) + out_sep + peptide_with_modifications + out_sep + protein + out_sep; + } // peptide Spectrum Hit pushed to the output file txt.addLine(lis); @@ -569,10 +605,10 @@ class TOPPPercolator : registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin", false, true); registerInputFile_("percolator_executable", "", // choose the default value according to the platform where it will be executed - #if defined(__APPLE__) - "percolator", - #else + #ifdef OPENMS_WINDOWSPLATFORM "percolator.exe", + #else + "percolator", #endif "Percolator executable of the installation e.g. 
'percolator.exe'", true, false, ListUtils::create("skipexists") ); @@ -621,6 +657,7 @@ class TOPPPercolator : registerFlag_("H", "Q-value threshold that will be used in the computation of the MSE and ROC AUC score in the grid search (recommended 0.05 for normal size datasets and 0.1 for big size datasets).(Only valid if option -A is active).", true); registerFlag_("fido-truncation", "Proteins with a very low score (< 0.001) will be truncated (assigned 0.0 probability).(Only valid if option -A is active)", true); registerFlag_("Q", "Uses protein group level inference, each cluster of proteins is either present or not, therefore when grouping proteins discard all possible combinations for each group.(Only valid if option -A is active and -N is inactive).", true); + registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); } ExitCodes main_(int, const char**) @@ -640,7 +677,7 @@ class TOPPPercolator : const String percolator_executable(getStringOption_("percolator_executable")); writeDebug_(String("Path to the percolator: ") + percolator_executable, 2); - if (percolator_executable.empty()) //TODO TOPPBase::findExecutable + if (percolator_executable.empty()) //TODO? - TOPPBase::findExecutable after registerInputFile_("percolator_executable"... ??? { writeLog_("No percolator executable specified. Aborting!"); printUsage_(); @@ -670,11 +707,26 @@ class TOPPPercolator : return ILLEGAL_PARAMETERS; } - if (peptide_ids.empty()) + //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process + for (std::vector::iterator pit = peptide_ids.begin(); pit != peptide_ids.end(); ++pit) { - writeLog_("No or empty input file specified. Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; + for (vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) + { + // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! 
+ if (!pht->metaValueExists("target_decoy")) + { + if (!in_decoy.empty()) + { + pht->setMetaValue("target_decoy", "target"); + } + else + { + writeLog_("No target decoy search results discrimination possible. Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + } + } } //------------------------------------------------------------- @@ -700,10 +752,10 @@ class TOPPPercolator : for (std::vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) { pht->setMetaValue("target_decoy", "decoy"); - //TODO what about proteins - internal target decoy handling is shitty - rework + //TODO what about proteins - internal target decoy handling is shitty - rework pls } } - //TODO this is going to fail with specrum_reference clashes + //TODO this is going to fail with specrum_reference clashes if not handled _REALLY_ carefully peptide_ids.insert( peptide_ids.end(), decoy_peptide_ids.begin(), decoy_peptide_ids.end() ); protein_ids.insert( protein_ids.end(), decoy_protein_ids.begin(), decoy_protein_ids.end() ); writeLog_("Using decoy hits from separate file."); @@ -741,7 +793,7 @@ class TOPPPercolator : } writeDebug_("Detected search engine: " + se , 2); - if (se == "MS-GF+") prepareMSGFpin(peptide_ids, txt, minCharge, maxCharge); + if (se == "MS-GF+") prepareMSGFpin(peptide_ids, txt, minCharge, maxCharge, getFlag_("MHC")); // if (se == "Mascot") prepareMASCOTpin(peptide_ids, txt, minCharge, maxCharge); if (se == "XTandem") prepareXTANDEMpin(peptide_ids, txt, minCharge, maxCharge); @@ -838,9 +890,10 @@ class TOPPPercolator : vector row_values; // peptide row_values.push_back(row[4].chop(2).reverse().chop(2).reverse()); +// writeDebug_("sequence: " + row[4].chop(2).reverse().chop(2).reverse(), 99); // SVM-score row_values.push_back(row[1]); - // Q-Value + // q-Value row_values.push_back(row[2]); // PEP row_values.push_back(row[3]); @@ -874,8 +927,9 @@ class TOPPPercolator : sid = sr.back(); if (pep_map.find(sid) == pep_map.end()) { - writeDebug_("No 
suitable PeptideIdentification entry 2nd found for " + sid + " - emulate percolator scores with exisiting scores?", 111); + //writeDebug_("No suitable PeptideIdentification entry 2nd found for " + sid + " - emulate percolator scores with exisiting scores?", 111); ++c_debug; + writeDebug_("No suitable PeptideIdentification entry for " + sid + , 3); continue; } } @@ -883,7 +937,9 @@ class TOPPPercolator : it->setScoreType("q-value"); it->setHigherScoreBetter(false); AASequence aat; - aat.fromString(pep_map[sid][0]); +// writeDebug_("sequence: " + pep_map[sid][0], 99); + aat = AASequence::fromString(pep_map[sid][0]); +// writeDebug_("sequence: " + aat.toString(), 99); for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { @@ -893,9 +949,13 @@ class TOPPPercolator : hit->setMetaValue("MS:1001492", pep_map[sid][1].toDouble()); //svm score double qv = pep_map[sid][2].toDouble(); // q-value hit->setMetaValue("MS:1001491", qv); + hit->setMetaValue("prepercolatorscore", hit->getScore()); + writeDebug_("found peptide and wrote percolator scoring from "+String(hit->getScore())+" to "+String(qv), 99); hit->setScore(qv); hit->setMetaValue("MS:1001493", pep_map[sid][3].toDouble()); //pep + //writeDebug_("found peptide and wrote percolator scoring", 99); } + else writeDebug_(aat.toString()+" - found nothing and wrote no percolator scoring", 99); } } writeDebug_("No suitable PeptideIdentification for " + String(c_debug) + " out of " + String(peptide_ids.size()), 2); From b442dc88a7775cab9a605e3b2df41b44857cdb49 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Wed, 21 Oct 2015 16:10:55 +0200 Subject: [PATCH 08/41] [Feature] refactored TopPerc a little bit more --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 66 ++ src/openms/source/ANALYSIS/ID/TopPerc.cpp | 577 ++++++++++++++++++ src/openms/source/ANALYSIS/ID/sources.cmake | 2 +- src/utils/TopPerc.cpp | 501 +-------------- 4 files changed, 652 insertions(+), 494 deletions(-) create mode 100644 
src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h create mode 100644 src/openms/source/ANALYSIS/ID/TopPerc.cpp diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h new file mode 100644 index 00000000000..45d38f90928 --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -0,0 +1,66 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2015. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Mathias Walzer $ +// $Authors: Mathias Walzer $ +// -------------------------------------------------------------------------- + +#ifndef OPENMS_ANALYSIS_ID_TOPPERC_H +#define OPENMS_ANALYSIS_ID_TOPPERC_H + +#include +#include +#include +#include + +#include +#include +#include + +namespace OpenMS +{ + class OPENMS_DLLAPI TopPerc + { + public: + static bool isEnz(const char& n, const char& c, std::string& enz); + static void prepareCUSTOMpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, std::vector& user_param_features, char out_sep='\t'); + static void prepareMSGFpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int minCharge, int maxCharge, bool addMHC = false, char out_sep='\t'); + static void prepareXTANDEMpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t'); + static size_t countEnzymatic(String peptide, std::string enz); + static double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons); + private: + TopPerc(); + virtual ~TopPerc(); + }; + +} //namespace OpenMS + +#endif //OPENMS_ANALYSIS_ID_TOPPERC_H + diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp new file mode 100644 index 00000000000..1fbd1b72c30 --- /dev/null +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp 
@@ -0,0 +1,577 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2015. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// -------------------------------------------------------------------------- +// $Maintainer: Mathias Walzer $ +// $Authors: Mathias Walzer $ +// -------------------------------------------------------------------------- + +#include +#include + +using namespace std; + +namespace OpenMS +{ + + bool TopPerc::isEnz(const char& n, const char& c, string& enz) + { + if (enz == "trypsin") + { + return ((n == 'K' || n == 'R') && c != 'P') || n == '-' || c == '-'; + } + else if (enz == "chymotrypsin") + { + return ((n == 'F' || n == 'W' || n == 'Y' || n == 'L') && c != 'P') || n == '-' || c == '-'; + } + else if (enz == "thermolysin") + { + return ((c == 'A' || c == 'F' || c == 'I' || c == 'L' || c == 'M' + || c == 'V' || (n == 'R' && c == 'G')) && n != 'D' && n != 'E') || n == '-' || c == '-'; + } + else if (enz == "proteinasek") + { + return (n == 'A' || n == 'E' || n == 'F' || n == 'I' || n == 'L' + || n == 'T' || n == 'V' || n == 'W' || n == 'Y') || n == '-' || c == '-'; + } + else if (enz == "pepsin") + { + return ((c == 'F' || c == 'L' || c == 'W' || c == 'Y' || n == 'F' + || n == 'L' || n == 'W' || n == 'Y') && n != 'R') || n == '-' || c == '-'; + } + else if (enz == "elastase") + { + return ((n == 'L' || n == 'V' || n == 'A' || n == 'G') && c != 'P') + || n == '-' || c == '-'; + } + else if (enz == "lys-n") + { + return (c == 'K') + || n == '-' || c == '-'; + } + else if (enz == "lys-c") + { + return ((n == 'K') && c != 'P') + || n == '-' || c == '-'; + } + else if (enz == "arg-c") + { + return ((n == 'R') && c != 'P') + || n == '-' || c == '-'; + } + else if (enz == "asp-n") + { + return (c == 'D') + || n == '-' || c == '-'; + } + else if (enz == "glu-c") + { + return ((n == 'E') && (c != 'P')) + || n == '-' || c == '-'; + } + else + { + return true; + } + } + + void TopPerc::prepareCUSTOMpin(vector& peptide_ids, string& enz, TextFile& txt, vector& user_param_features, char out_sep) + { + // Create header for the features + string min_featureset = "SpecId, 
Label, ScanNr"; + StringList txt_header = ListUtils::create(min_featureset); + txt_header.insert(txt_header.end(), user_param_features.begin(), user_param_features.end() ); + txt.addLine(ListUtils::concatenate(txt_header, out_sep)); + + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + String spec_ref = it->getMetaValue("spectrum_reference").toString(); + vector scan_id; + spec_ref.split("scan=", scan_id); + String sid = scan_id.back(); + int label = 1; + String SpecId = "target_SII_"; + if ((String(hit->getMetaValue("target_decoy"))).hasSubstring("decoy")) + { + SpecId = "decoy_SII_"; + label = -1; + } + + SpecId += sid + "_" + String(hit->getCharge()); + + StringList collected_feats; + collected_feats.push_back(SpecId); + collected_feats.push_back(label); + collected_feats.push_back(sid); + + for (vector::const_iterator feat = user_param_features.begin(); feat != user_param_features.end(); ++feat) + { + // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! 
+ if (hit->metaValueExists(*feat)) + { + collected_feats.push_back(hit->getMetaValue(*feat).toString()); + } + } + if (collected_feats.size() == user_param_features.size()) + { // only if all feats were present add + txt.addLine(ListUtils::concatenate(collected_feats, out_sep)); + } + } + } + } + + void TopPerc::prepareMSGFpin(vector& peptide_ids, string& enz, TextFile& txt, int minCharge, int maxCharge, bool addMHC, char out_sep) + { + // Create String of the charges for the header of the tab file + stringstream ss; + ss << "Charge" << minCharge << ", "; + for (int j = minCharge + 1; j < maxCharge + 1; j++) + { + ss << "Charge" << j << ","; + } + + // Create header for the features + string featureset = "SpecId, Label,ScanNr, RawScore, DeNovoScore,ScoreRatio, Energy,lnEValue,IsotopeError, lnExplainedIonCurrentRatio,lnNTermIonCurrentRatio,lnCTermIonCurrentRatio,lnMS2IonCurrent,Mass,PepLen,dM,absdM,MeanErrorTop7,sqMeanErrorTop7,StdevErrorTop7," + ss.str() ; + StringList txt_header0 = ListUtils::create(featureset); + if (addMHC) + { + txt_header0.push_back("enzN"); + txt_header0.push_back("enzC"); + txt_header0.push_back("MHCLct"); + txt_header0.push_back("Peptide"); + txt_header0.push_back("Protein"); + } + else + { + txt_header0.push_back("enzN"); + txt_header0.push_back("enzC"); + txt_header0.push_back("enzInt"); + txt_header0.push_back("Peptide"); + txt_header0.push_back("Protein"); + } + txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); + + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! 
+ if (hit->metaValueExists("NumMatchedMainIons")) + { + // only take features from first ranked entries and only with meanerrortop7 != 0.0 + if (hit->getRank() == 1 && hit->getMetaValue("MeanErrorTop7").toString().toDouble() != 0.0) + { + int rank = hit->getRank(); + int charge = hit->getCharge(); + + String spec_ref = it->getMetaValue("spectrum_reference").toString(); + vector scan_id; + spec_ref.split("scan=", scan_id); + String sid = scan_id.back(); + + int label = 1; + String SpecId = "target_SII_"; + if ((String(hit->getMetaValue("target_decoy"))).hasSubstring("decoy")) + { + SpecId = "decoy_SII_"; + label = -1; + } + + SpecId += sid + "_" + String(rank) + "_" + sid + "_" + String(charge) + "_" + String(rank); + + double rawScore = hit->getMetaValue("MS:1002049").toString().toDouble(); + double denovoScore = hit->getMetaValue("MS:1002050").toString().toDouble(); + + double scoreRatio; + if (denovoScore > 0) + { + scoreRatio = (rawScore / denovoScore); + } + else + { + scoreRatio = rawScore * 10000; + } + + double energy = denovoScore - rawScore; + double ln_eval = -log(hit->getMetaValue("MS:1002053").toString().toDouble()); + int isotopeError = hit->getMetaValue("IsotopeError").toString().toInt(); + double lnExplainedIonCurrentRatio = log(hit->getMetaValue("ExplainedIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! + double lnNTermIonCurrentRatio = log(hit->getMetaValue("NTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! + double lnCTermIonCurrentRatio = log(hit->getMetaValue("CTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! 
+ double lnMS2IonCurrent = log(hit->getMetaValue("MS2IonCurrent").toString().toDouble()); + double expMass = it->getMZ(); + double calcMass = hit->getMetaValue("calcMZ"); + int pepLen = hit->getSequence().toUnmodifiedString().length(); + double dM = (expMass - (isotopeError * Constants::NEUTRON_MASS_U / charge) - calcMass) / expMass; + double absdM = abs(dM); + double meanErrorTop7 = hit->getMetaValue("MeanErrorTop7").toString().toDouble(); + int NumMatchedMainIons = hit->getMetaValue("NumMatchedMainIons").toString().toInt(); + + double stdevErrorTop7 = 0.0; + if (hit->getMetaValue("StdevErrorTop7").toString() != "NaN") + { + stdevErrorTop7 = hit->getMetaValue("StdevErrorTop7").toString().toDouble(); + if (stdevErrorTop7 == 0.0) + { + stdevErrorTop7 = meanErrorTop7; + } + } + else + { + LOG_WARN << "Stdeverrortop7 is NaN" << endl; + } + + meanErrorTop7 = rescaleFragmentFeature(meanErrorTop7, NumMatchedMainIons); + double sqMeanErrorTop7 = rescaleFragmentFeature(meanErrorTop7 * meanErrorTop7, NumMatchedMainIons); + stdevErrorTop7 = rescaleFragmentFeature(stdevErrorTop7, NumMatchedMainIons); + + // write 1 for the correct charge, 0 for other charges + // i.e.: charge 3 for charges from 2-5: 0 1 0 0 + stringstream ss; + int i = minCharge; + while (i <= maxCharge) + { + if (charge != i) + { + ss << "0" << out_sep; + } + if (charge == i) + { + ss << "1" << out_sep; + } + i++; + } + char aaBefore = hit->getPeptideEvidences().front().getAABefore(); + char aaAfter = hit->getPeptideEvidences().front().getAAAfter(); + + // sequence without modification: "ABC" instead of "ABC[UNIMOD:4]" + String peptide_without_modifications = aaBefore + string(".") + hit->getSequence().toUnmodifiedString() + string(".") + aaAfter; + + // formula taken from percolator msgfplus-converter isEnz(n, c) for trypsin + bool enzN = isEnz(peptide_without_modifications.at(0), peptide_without_modifications.at(2), enz); + bool enzC = 
isEnz(peptide_without_modifications.at(peptide_without_modifications.size() - 3), peptide_without_modifications.at(peptide_without_modifications.size() - 1), enz); + int enzInt = countEnzymatic(hit->getSequence().toUnmodifiedString(), enz); + + String peptide_with_modifications = aaBefore + string(".") + hit->getSequence().toString() + string(".") + aaAfter; + String protein = hit->getPeptideEvidences().front().getProteinAccession(); + + // One PeptideSpectrumHit with all its features + String lis = SpecId + out_sep + String(label) + out_sep + scan_id[1] + out_sep + (String)rawScore + out_sep + + (String)denovoScore + out_sep + (String)scoreRatio + out_sep + (String)energy + out_sep + (String)ln_eval + + out_sep + (String)isotopeError + out_sep + (String)lnExplainedIonCurrentRatio + out_sep + + (String)lnNTermIonCurrentRatio + out_sep + (String)lnCTermIonCurrentRatio + out_sep + (String)lnMS2IonCurrent + + out_sep + (String)expMass + out_sep + (String)pepLen + out_sep + (String)dM + out_sep + (String)absdM + out_sep + + (String)meanErrorTop7 + out_sep + (String)sqMeanErrorTop7 + out_sep + (String)stdevErrorTop7 + + out_sep + String(ss.str()); + if (addMHC) + { + bool suf = false; + static const string arr[] = {"A", "F", "I", "K", "M", "L", "R", "W", "V"}; + vector mhcends (arr, arr + sizeof(arr) / sizeof(arr[0]) ); + for (std::vector::iterator eit = mhcends.begin(); eit != mhcends.end(); ++eit) + { + if (hit->getSequence().toUnmodifiedString().hasSuffix(string(*eit))) + { + suf = true; + break; + } + } + lis = lis + String(enzN) + out_sep + String(enzC) + out_sep + + String(suf) + out_sep + peptide_with_modifications + out_sep + protein + out_sep; + } + else + { + lis = lis + String(enzN) + out_sep + String(enzC) + out_sep + + String(enzInt) + out_sep + peptide_with_modifications + out_sep + protein + out_sep; + } + + // peptide Spectrum Hit pushed to the output file + txt.addLine(lis); + } + } + } + } + } + + void TopPerc::prepareXTANDEMpin(vector& peptide_ids, 
string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep) + { + // Create String of the charges for the header of the tab file + stringstream ss; + ss << "Charge" << minCharge << ", "; + for (int j = minCharge + 1; j < maxCharge + 1; j++) + { + + ss << "Charge" << j << ","; + } + + // Find out which ions are in XTandem-File and take only these as features + stringstream ss_ion; + if (peptide_ids.front().getHits().front().getMetaValue("a_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion << "frac_ion_a" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("b_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion << "frac_ion_b" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("c_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion << "frac_ion_c" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("x_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion << "frac_ion_x" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("y_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion << "frac_ion_y" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("z_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion << "frac_ion_z" << ","; + } + + // Create header for the features + String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + + ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; + StringList txt_header0 = ListUtils::create(featureset); + // Insert the header with the features names to the file + 
txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); + + LOG_INFO << "read in target file" << endl; + // get all the features from the target file + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + if (it->isHigherScoreBetter()) + { + String scannumber = String(it->getMetaValue("spectrum_reference")); + int charge = it->getHits().front().getCharge(); + int label = 1; + double hyperscore = it->getHits().front().getScore(); + // deltascore = hyperscore - nextscore + double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); + String sequence = it->getHits().front().getSequence().toString(); + int length = sequence.length(); + + // Find out correct ion types and get its Values + stringstream ss_ion_2; + + if (it->getHits().front().getMetaValue("a_score").toString() != "" && + it->getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("b_score").toString() != "" && + it->getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("c_score").toString() != "" && + it->getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("x_score").toString() != "" && + it->getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("y_score").toString() != "" && + it->getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; + } + if 
(it->getHits().front().getMetaValue("z_score").toString() != "" && + it->getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length << out_sep; + } + double mass = it->getHits().front().getMetaValue("mass"); + double dm = it->getHits().front().getMetaValue("delta"); + double mh = mass + dm; + double absdM = abs(dm); + + // write 1 for the correct charge, 0 for other charges + // i.e.: charge 3 for charges from 2-5: 0 1 0 0 + stringstream ss; + int i = minCharge; + while (i <= maxCharge) + { + if (charge != i) + { + ss << "0" << out_sep; + } + if (charge == i) + { + ss << "1" << out_sep; + } + i++; + } + + char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); + char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); + + String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; + + // formula taken from percolator converter isEnz(n, c) for trypsin + bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); + bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); + int enzInt = countEnzymatic(sequence, enz); + String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); + + // One PeptideSpectrumHit with all its features + String lis = "_tandem_output_file_target_" + scannumber + "_" + String(charge) + + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; + + // peptide Spectrum Hit pushed to the output file + txt.addLine(lis); + } + } + + LOG_INFO << "read in decoy file" << endl; + // get all the features from the decoy file + for 
(vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + if (it->isHigherScoreBetter()) + { + String scannumber = String(it->getMetaValue("spectrum_reference")); + int charge = it->getHits().front().getCharge(); + int label = -1; + double hyperscore = it->getHits().front().getScore(); + // deltascore = hyperscore - nextscore + double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); + String sequence = it->getHits().front().getSequence().toString(); + int length = sequence.length(); + + // Find out correct ion types and get its Values + stringstream ss_ion_2; + + if (it->getHits().front().getMetaValue("a_score").toString() != "" && it->getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("b_score").toString() != "" && it->getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("c_score").toString() != "" && it->getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("x_score").toString() != "" && it->getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("y_score").toString() != "" && it->getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("z_score").toString() != "" && it->getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) 
/ length; + } + double mass = it->getHits().front().getMetaValue("mass"); + double dm = double(it->getHits().front().getMetaValue("delta")); + double mh = mass + dm; + double absdM = abs(dm); + + // write 1 for the correct charge, 0 for other charges + // i.e: charge 3 for charges from 2-5: 0 1 0 0 + stringstream ss; + int i = minCharge; + while (i <= maxCharge) + { + if (charge != i) + { + ss << "0" << out_sep; + } + if (charge == i) + { + ss << "1" << out_sep; + } + i++; + } + + char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); + char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); + + String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; + + // formula taken from percolator converter isEnz(n, c) for trypsin + bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); + bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); + int enzInt = countEnzymatic(sequence, enz); + String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); + + // One PeptideSpectrumHit with all its features + String lis = "_tandem_output_file_decoy_" + scannumber + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep + + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; + + // peptide Spectrum Hit pushed to the output file + txt.addLine(lis); + } + } + } + + // Function taken from Enzyme.h from Percolator + size_t TopPerc::countEnzymatic(String peptide, string enz) + { + size_t count = 0; + for (size_t ix = 1; ix < peptide.size(); ++ix) + { + if (isEnz(peptide[ix - 1], peptide[ix], enz)) + { + ++count; + } + } + return count; + } + + // Function taken from 
the percolator converter MsgfplusReader + double TopPerc::rescaleFragmentFeature(double featureValue, int NumMatchedMainIons) + { + // Rescale the fragment features to penalize features calculated by few ions + int numMatchedIonLimit = 7; + int numerator = (1 + numMatchedIonLimit) * (1 + numMatchedIonLimit); + int denominator = (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)) * (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)); + return featureValue * ((double)numerator / denominator); + } + +} diff --git a/src/openms/source/ANALYSIS/ID/sources.cmake b/src/openms/source/ANALYSIS/ID/sources.cmake index 60bba18c8aa..6aa2d36ebb3 100644 --- a/src/openms/source/ANALYSIS/ID/sources.cmake +++ b/src/openms/source/ANALYSIS/ID/sources.cmake @@ -23,7 +23,7 @@ MetaboliteSpectralMatching.cpp PeptideProteinResolution.cpp ProtonDistributionModel.cpp PeptideIndexing.cpp -#TopPerc.cpp +TopPerc.cpp ) ### add path to the filenames diff --git a/src/utils/TopPerc.cpp b/src/utils/TopPerc.cpp index 4faef43fcfa..08deb04c1cd 100644 --- a/src/utils/TopPerc.cpp +++ b/src/utils/TopPerc.cpp @@ -43,7 +43,7 @@ #include #include #include - +#include #include #include #include @@ -106,494 +106,6 @@ class TOPPPercolator : } protected: - void prepareMSGFpin(vector& peptide_ids, TextFile& txt, int minCharge, int maxCharge, bool addMHC = false, char out_sep='\t') - { - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge + 1; j < maxCharge + 1; j++) - { - ss << "Charge" << j << ","; - } - - // Create header for the features - string featureset = "SpecId, Label,ScanNr, RawScore, DeNovoScore,ScoreRatio, Energy,lnEValue,IsotopeError, lnExplainedIonCurrentRatio,lnNTermIonCurrentRatio,lnCTermIonCurrentRatio,lnMS2IonCurrent,Mass,PepLen,dM,absdM,MeanErrorTop7,sqMeanErrorTop7,StdevErrorTop7," + ss.str() ; - StringList txt_header0 = ListUtils::create(featureset); - if (addMHC) - { - txt_header0.push_back("enzN"); 
- txt_header0.push_back("enzC"); - txt_header0.push_back("MHCLct"); - txt_header0.push_back("Peptide"); - txt_header0.push_back("Protein"); - } - else - { - txt_header0.push_back("enzN"); - txt_header0.push_back("enzC"); - txt_header0.push_back("enzInt"); - txt_header0.push_back("Peptide"); - txt_header0.push_back("Protein"); - } - txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); - - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) - { - // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! - if (hit->metaValueExists("NumMatchedMainIons")) - { - // only take features from first ranked entries and only with meanerrortop7 != 0.0 - if (hit->getRank() == 1 && hit->getMetaValue("MeanErrorTop7").toString().toDouble() != 0.0) - { - int rank = hit->getRank(); - int charge = hit->getCharge(); - - String spec_ref = it->getMetaValue("spectrum_reference").toString(); - vector scan_id; - spec_ref.split("scan=", scan_id); - String sid = scan_id.back(); - - int label = 1; - String SpecId = "target_SII_"; - if ((String(hit->getMetaValue("target_decoy"))).hasSubstring("decoy")) - { - SpecId = "decoy_SII_"; - label = -1; - } - - SpecId += sid + "_" + String(rank) + "_" + sid + "_" + String(charge) + "_" + String(rank); - - double rawScore = hit->getMetaValue("MS:1002049").toString().toDouble(); - double denovoScore = hit->getMetaValue("MS:1002050").toString().toDouble(); - - double scoreRatio; - if (denovoScore > 0) - { - scoreRatio = (rawScore / denovoScore); - } - else - { - scoreRatio = rawScore * 10000; - } - - double energy = denovoScore - rawScore; - double ln_eval = -log(hit->getMetaValue("MS:1002053").toString().toDouble()); - int isotopeError = hit->getMetaValue("IsotopeError").toString().toInt(); - double lnExplainedIonCurrentRatio = 
log(hit->getMetaValue("ExplainedIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! - double lnNTermIonCurrentRatio = log(hit->getMetaValue("NTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! - double lnCTermIonCurrentRatio = log(hit->getMetaValue("CTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! - double lnMS2IonCurrent = log(hit->getMetaValue("MS2IonCurrent").toString().toDouble()); - double expMass = it->getMZ(); - double calcMass = hit->getMetaValue("calcMZ"); - int pepLen = hit->getSequence().toUnmodifiedString().length(); - double dM = (expMass - (isotopeError * Constants::NEUTRON_MASS_U / charge) - calcMass) / expMass; - double absdM = abs(dM); - double meanErrorTop7 = hit->getMetaValue("MeanErrorTop7").toString().toDouble(); - int NumMatchedMainIons = hit->getMetaValue("NumMatchedMainIons").toString().toInt(); - - double stdevErrorTop7 = 0.0; - if (hit->getMetaValue("StdevErrorTop7").toString() != "NaN") - { - stdevErrorTop7 = hit->getMetaValue("StdevErrorTop7").toString().toDouble(); - if (stdevErrorTop7 == 0.0) - { - stdevErrorTop7 = meanErrorTop7; - } - } - else - { - LOG_WARN << "Stdeverrortop7 is NaN" << endl; - } - - meanErrorTop7 = rescaleFragmentFeature(meanErrorTop7, NumMatchedMainIons); - double sqMeanErrorTop7 = rescaleFragmentFeature(meanErrorTop7 * meanErrorTop7, NumMatchedMainIons); - stdevErrorTop7 = rescaleFragmentFeature(stdevErrorTop7, NumMatchedMainIons); - - // write 1 for the correct charge, 0 for other charges - // i.e.: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - char aaBefore = hit->getPeptideEvidences().front().getAABefore(); - char aaAfter = hit->getPeptideEvidences().front().getAAAfter(); - - // sequence without modification: "ABC" instead of "ABC[UNIMOD:4]" - String 
peptide_without_modifications = aaBefore + string(".") + hit->getSequence().toUnmodifiedString() + string(".") + aaAfter; - - // formula taken from percolator msgfplus-converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide_without_modifications.at(0), peptide_without_modifications.at(2), getStringOption_("enzyme")); - bool enzC = isEnz(peptide_without_modifications.at(peptide_without_modifications.size() - 3), peptide_without_modifications.at(peptide_without_modifications.size() - 1), getStringOption_("enzyme")); - int enzInt = countEnzymatic(hit->getSequence().toUnmodifiedString(), getStringOption_("enzyme")); - - String peptide_with_modifications = aaBefore + string(".") + hit->getSequence().toString() + string(".") + aaAfter; - String protein = hit->getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = SpecId + out_sep + String(label) + out_sep + scan_id[1] + out_sep + (String)rawScore + out_sep + - (String)denovoScore + out_sep + (String)scoreRatio + out_sep + (String)energy + out_sep + (String)ln_eval + - out_sep + (String)isotopeError + out_sep + (String)lnExplainedIonCurrentRatio + out_sep + - (String)lnNTermIonCurrentRatio + out_sep + (String)lnCTermIonCurrentRatio + out_sep + (String)lnMS2IonCurrent - + out_sep + (String)expMass + out_sep + (String)pepLen + out_sep + (String)dM + out_sep + (String)absdM + out_sep + - (String)meanErrorTop7 + out_sep + (String)sqMeanErrorTop7 + out_sep + (String)stdevErrorTop7 + - out_sep + String(ss.str()); - if (addMHC) - { - bool suf = false; - static const string arr[] = {"A", "F", "I", "K", "M", "L", "R", "W", "V"}; - vector mhcends (arr, arr + sizeof(arr) / sizeof(arr[0]) ); - for (std::vector::iterator eit = mhcends.begin(); eit != mhcends.end(); ++eit) - { - if (hit->getSequence().toUnmodifiedString().hasSuffix(string(*eit))) - { - suf = true; - break; - } - } - lis = lis + String(enzN) + out_sep + String(enzC) + out_sep - + String(suf) + 
out_sep + peptide_with_modifications + out_sep + protein + out_sep; - } - else - { - lis = lis + String(enzN) + out_sep + String(enzC) + out_sep - + String(enzInt) + out_sep + peptide_with_modifications + out_sep + protein + out_sep; - } - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); - } - } - } - } - } - - void prepareXTANDEMpin(vector& peptide_ids, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t') - { - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge + 1; j < maxCharge + 1; j++) - { - - ss << "Charge" << j << ","; - } - - // Find out which ions are in XTandem-File and take only these as features - stringstream ss_ion; - if (peptide_ids.front().getHits().front().getMetaValue("a_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion << "frac_ion_a" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("b_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion << "frac_ion_b" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("c_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion << "frac_ion_c" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("x_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion << "frac_ion_x" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("y_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion << "frac_ion_y" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("z_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion 
<< "frac_ion_z" << ","; - } - - // Create header for the features - String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + - ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; - StringList txt_header0 = ListUtils::create(featureset); - // Insert the header with the features names to the file - txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); - - LOG_INFO << "read in target file" << endl; - // get all the features from the target file - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - if (it->isHigherScoreBetter()) - { - String scannumber = String(it->getMetaValue("spectrum_reference")); - int charge = it->getHits().front().getCharge(); - int label = 1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out correct ion types and get its Values - stringstream ss_ion_2; - - if (it->getHits().front().getMetaValue("a_score").toString() != "" && - it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && - it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" && - it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && - it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << 
double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && - it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && - it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length << out_sep; - } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = it->getHits().front().getMetaValue("delta"); - double mh = mass + dm; - double absdM = abs(dm); - - // write 1 for the correct charge, 0 for other charges - // i.e.: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), getStringOption_("enzyme")); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), getStringOption_("enzyme")); - int enzInt = countEnzymatic(sequence, getStringOption_("enzyme")); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_target_" + scannumber + "_" + String(charge) + - "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + - out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + - 
String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + - String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); - } - } - - LOG_INFO << "read in decoy file" << endl; - // get all the features from the decoy file - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - if (it->isHigherScoreBetter()) - { - String scannumber = String(it->getMetaValue("spectrum_reference")); - int charge = it->getHits().front().getCharge(); - int label = -1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out correct ion types and get its Values - stringstream ss_ion_2; - - if (it->getHits().front().getMetaValue("a_score").toString() != "" && it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" && it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if 
(it->getHits().front().getMetaValue("y_score").toString() != "" && it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length; - } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = double(it->getHits().front().getMetaValue("delta")); - double mh = mass + dm; - double absdM = abs(dm); - - // write 1 for the correct charge, 0 for other charges - // i.e: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), getStringOption_("enzyme")); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), getStringOption_("enzyme")); - int enzInt = countEnzymatic(sequence, getStringOption_("enzyme")); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_decoy_" + scannumber + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep - + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep 
+ String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); - } - } - } - - // Function taken from Enzyme.h from Percolator - bool isEnz(const char& n, const char& c, string enz) - { - if (enz == "trypsin") - { - return ((n == 'K' || n == 'R') && c != 'P') || n == '-' || c == '-'; - } - else if (enz == "chymotrypsin") - { - return ((n == 'F' || n == 'W' || n == 'Y' || n == 'L') && c != 'P') || n == '-' || c == '-'; - } - else if (enz == "thermolysin") - { - return ((c == 'A' || c == 'F' || c == 'I' || c == 'L' || c == 'M' - || c == 'V' || (n == 'R' && c == 'G')) && n != 'D' && n != 'E') || n == '-' || c == '-'; - } - else if (enz == "proteinasek") - { - return (n == 'A' || n == 'E' || n == 'F' || n == 'I' || n == 'L' - || n == 'T' || n == 'V' || n == 'W' || n == 'Y') || n == '-' || c == '-'; - } - else if (enz == "pepsin") - { - return ((c == 'F' || c == 'L' || c == 'W' || c == 'Y' || n == 'F' - || n == 'L' || n == 'W' || n == 'Y') && n != 'R') || n == '-' || c == '-'; - } - else if (enz == "elastase") - { - return ((n == 'L' || n == 'V' || n == 'A' || n == 'G') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "lys-n") - { - return (c == 'K') - || n == '-' || c == '-'; - } - else if (enz == "lys-c") - { - return ((n == 'K') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "arg-c") - { - return ((n == 'R') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "asp-n") - { - return (c == 'D') - || n == '-' || c == '-'; - } - else if (enz == "glu-c") - { - return ((n == 'E') && (c != 'P')) - || n == '-' || c == '-'; - } - else - { - return true; - } - } - - // Function taken from Enzyme.h from Percolator - size_t countEnzymatic(String peptide, string enz) - { - size_t count = 0; - for (size_t ix = 1; ix < peptide.size(); ++ix) - { - if (isEnz(peptide[ix - 1], peptide[ix], enz)) - { - ++count; - } - } - return 
count; - } - - // Function taken from the percolator converter MsgfplusReader - double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons) - { - // Rescale the fragment features to penalize features calculated by few ions - int numMatchedIonLimit = 7; - int numerator = (1 + numMatchedIonLimit) * (1 + numMatchedIonLimit); - int denominator = (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)) * (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)); - return featureValue * ((double)numerator / denominator); - } - void registerOptionsAndFlags_() { registerInputFile_("percolator_executable", "", "", "Path to the percolator binary", true, false, ListUtils::create("skipexists")); @@ -602,7 +114,9 @@ class TOPPPercolator : registerInputFile_("in_decoy", "", "", "Input decoy file", false); setValidFormats_("in_decoy", ListUtils::create("mzid")); registerOutputFile_("out", "", "", "Output file", true); - registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin", false, true); + std::string enzs = "no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin"; + registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: "+enzs , false, true); + setValidStrings_("enzyme", ListUtils::create(enzs)); registerInputFile_("percolator_executable", "", // choose the default value according to the platform where it will be executed #ifdef OPENMS_WINDOWSPLATFORM @@ -792,10 +306,11 @@ class TOPPPercolator : } } + std::string enz_str = getStringOption_("enzyme"); writeDebug_("Detected search engine: " + se , 2); - if (se == "MS-GF+") prepareMSGFpin(peptide_ids, txt, minCharge, maxCharge, getFlag_("MHC")); + if (se == "MS-GF+") TopPerc::prepareMSGFpin(peptide_ids, enz_str, txt, minCharge, maxCharge, getFlag_("MHC")); // if (se == "Mascot") prepareMASCOTpin(peptide_ids, txt, minCharge, maxCharge); - if (se == "XTandem") 
prepareXTANDEMpin(peptide_ids, txt, minCharge, maxCharge); + if (se == "XTandem") TopPerc::prepareXTANDEMpin(peptide_ids, enz_str, txt, minCharge, maxCharge); writeLog_( "Executing percolator!"); @@ -929,7 +444,7 @@ class TOPPPercolator : { //writeDebug_("No suitable PeptideIdentification entry 2nd found for " + sid + " - emulate percolator scores with exisiting scores?", 111); ++c_debug; - writeDebug_("No suitable PeptideIdentification entry for " + sid + , 3); + writeDebug_("No suitable PeptideIdentification entry for " + sid, 3); continue; } } From 72f81c122fe654775220feab9e5e8b550e4c382a Mon Sep 17 00:00:00 2001 From: mwalzer Date: Tue, 24 Nov 2015 16:40:21 +0100 Subject: [PATCH 09/41] [NOP] added SE PIN info for TopPerc --- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 508 ++++++++++++++++++++++ 1 file changed, 508 insertions(+) diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 1fbd1b72c30..29ccccc60ef 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -550,6 +550,514 @@ namespace OpenMS } } + void TopPerc::prepareCOMETpin(vector& peptide_ids, string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep) + { + /** -no decoy comet search +id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charge1 Charge2 Charge3 Charge4 Charge5 Charge6 enzN enzC enzInt lnNumSP dM absdM peptide proteinId1 +/home/.../150209_msms4_45_3_1 1 45 2.564949 0.120106 0.058356 1.58511 0.917335 94.189621 0.1875 1541.939549 13 0 0 1 0 0 0 0 0 2 8.659387 -0.000001 0.000001 H.FVIIIRKQTDLPV.I XXX_sp|P30307|MPIP3_HUMAN +/home/.../150209_msms4_55_2_1 1 55 2.484907 0.19764 0.077428 0.919862 0.757954 66.903687 0.375 1087.697313 9 0 1 0 0 0 0 0 0 3 7.689829 0.000002 0.000002 G.TRSLKRLLT.A XXX_sp|Q8IWD5|MFS6L_HUMAN +/home/.../150209_msms4_58_2_1 1 58 2.772589 0.304528 0.119574 0.471976 0.875639 44.227581 0.2222 1086.695849 10 0 1 0 0 0 0 1 0 5 8.254269 
-0.000003 0.000003 K.KSAKKTPKKA.K sp|P16403|H12_HUMAN +/home/.../150209_msms4_70_3_1 1 70 1.609438 0.22949 0.161329 0.194216 1.455887 249.314102 0.2708 1399.760349 13 0 0 1 0 0 0 0 1 1 10.29438 0.000009 0.000009 V.KFNGAHIPGSPFK.I sp|Q14315|FLNC_HUMAN +/home/.../150209_msms4_85_3_1 1 85 0 0.330495 0.2621 -1.688895 2.168427 926.547668 0.4545 1412.87949 12 0 0 1 0 0 0 1 0 5 8.278428 -0.000001 0.000001 K.RAKAKTTKKRPQ.R sp|P24844|MYL9_HUMAN +/home/.../150209_msms4_89_3_1 1 89 2.995732 0.105586 0.028294 1.622349 1.383884 216.628876 0.2857 1412.88068 15 0 0 1 0 0 0 0 0 0 8.322637 0.000006 0.000006 I.AVVSVTVLLAISLAG.M sp|P55017|S12A3_HUMAN +/home/.../150209_msms4_93_3_1 1 93 3.931826 0.247239 0.074946 0.447477 1.304711 117.67543 0.2115 1480.967259 14 0 0 1 0 0 0 0 1 4 6.850126 -0.000001 0.000001 P.AKKPKAAKAKKPSK.A XXX_sp|P10412|H14_HUMAN +/home/.../150209_msms4_95_2_1 1 95 0.693147 0.088233 0.011378 1.549132 1.088933 306.906311 0.5 851.43651 8 0 1 0 0 0 0 1 0 0 10.106918 0.000008 0.000008 R.FSPGIPAY.P sp|Q96MF6|CQ10A_HUMAN +/home/.../150209_msms4_97_3_1 1 97 4.007333 0.090959 0.061997 1.593871 1.009504 106.411644 0.1538 1569.985447 14 0 0 1 0 0 0 0 1 4 7.784889 -0.000004 0.000004 T.RRSQALKKLVGSVK.S XXX_sp|P13994|CC130_HUMAN +/home/.../150209_msms4_99_3_1 1 99 0 0.43843 0.227306 -3.148641 2.312463 637.051819 0.4038 1480.967716 14 0 0 1 0 0 0 1 1 5 6.919684 -0.000001 0.000001 K.KAKSPKKAKAAKPK.K sp|P10412|H14_HUMAN +/home/.../150209_msms4_100_3_1 1 100 4.406719 0.091476 0.051751 1.433108 1.358069 157.195969 0.2115 1568.978001 14 0 0 1 0 0 0 0 0 5 7.985484 -0.000007 0.000007 A.KALKGKEPPKKVFV.G sp|O14979|HNRDL_HUMAN +/home/.../150209_msms4_103_3_1 1 103 4.094345 0.20423 0.129201 0.77962 0.981422 40.896797 0.1607 1568.984593 15 0 0 1 0 0 0 0 0 0 7.904704 0.00001 0.00001 N.LLGLIEMILLSVGVV.M sp|P16671|CD36_HUMAN +/home/.../150209_msms4_104_3_1 1 104 3.970292 0.290208 0.198715 -0.32615 1.332213 89.666199 0.1875 1587.912022 13 0 0 1 0 0 0 0 1 3 9.75022 -0.000004 0.000004 
T.QDGLFLRRAISRR.Y XXX_sp|Q2LD37|K1109_HUMAN + */ + /** -with decoy comet search +id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charge1 Charge2 Charge3 Charge4 Charge5 Charge6 enzN enzC enzInt lnNumSP dM absdM peptide proteinId1 +/home/.../150209_msms4_45_3_1 1 45 2.890372 0.066992 0.055908 2.212066 0.917335 94.189621 0.1875 1541.939549 13 0 0 1 0 0 0 0 0 2 9.352534 -0.000001 0.000001 H.FVIIIRKQTDLPV.I XXX_sp|P30307|MPIP3_HUMAN +/home/.../150209_msms4_55_2_1 -1 55 2.70805 0.257442 0.142884 0.236914 0.884307 65.903358 0.3125 1087.697313 9 0 1 0 0 0 0 0 0 3 8.382976 0.000002 0.000002 F.TIRRKSLLT.S DECOY_XXX_sp|Q5VYS8|TUT7_HUMAN +/home/.../150209_msms4_58_2_1 -1 58 2.079442 0.189541 0.038707 1.094758 0.910897 67.636864 0.2778 1086.695849 10 0 1 0 0 0 0 1 0 4 8.947416 -0.000003 0.000003 K.SKAKKPTKKA.K DECOY_sp|P16403|H12_HUMAN +/home/.../150209_msms4_70_3_1 1 70 0.693147 0.199883 0.161329 0.813617 1.455887 249.314102 0.2708 1399.760349 13 0 0 1 0 0 0 0 1 1 10.987528 0.000009 0.000009 V.KFNGAHIPGSPFK.I sp|Q14315|FLNC_HUMAN + */ + + // Create String of the charges for the header of the tab file + stringstream ss; + ss << "Charge" << minCharge << ", "; + for (int j = minCharge + 1; j < maxCharge + 1; j++) + { + + ss << "Charge" << j << ","; + } + + // Find out which ions are in XTandem-File and take only these as features + stringstream ss_ion; + if (peptide_ids.front().getHits().front().getMetaValue("a_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion << "frac_ion_a" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("b_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion << "frac_ion_b" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("c_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion << 
"frac_ion_c" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("x_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion << "frac_ion_x" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("y_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion << "frac_ion_y" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("z_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion << "frac_ion_z" << ","; + } + + // Create header for the features + String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + + ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; + StringList txt_header0 = ListUtils::create(featureset); + // Insert the header with the features names to the file + txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); + + LOG_INFO << "read in target file" << endl; + // get all the features from the target file + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + if (it->isHigherScoreBetter()) + { + String scannumber = String(it->getMetaValue("spectrum_reference")); + int charge = it->getHits().front().getCharge(); + int label = 1; + double hyperscore = it->getHits().front().getScore(); + // deltascore = hyperscore - nextscore + double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); + String sequence = it->getHits().front().getSequence().toString(); + int length = sequence.length(); + + // Find out correct ion types and get its Values + stringstream ss_ion_2; + + if (it->getHits().front().getMetaValue("a_score").toString() != "" && + it->getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; 
+ } + if (it->getHits().front().getMetaValue("b_score").toString() != "" && + it->getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("c_score").toString() != "" && + it->getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("x_score").toString() != "" && + it->getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("y_score").toString() != "" && + it->getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("z_score").toString() != "" && + it->getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length << out_sep; + } + double mass = it->getHits().front().getMetaValue("mass"); + double dm = it->getHits().front().getMetaValue("delta"); + double mh = mass + dm; + double absdM = abs(dm); + + // write 1 for the correct charge, 0 for other charges + // i.e.: charge 3 for charges from 2-5: 0 1 0 0 + stringstream ss; + int i = minCharge; + while (i <= maxCharge) + { + if (charge != i) + { + ss << "0" << out_sep; + } + if (charge == i) + { + ss << "1" << out_sep; + } + i++; + } + + char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); + char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); + + String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; + + // formula taken from percolator converter isEnz(n, c) for trypsin + bool enzN = isEnz(peptide.at(0), 
peptide.at(2), enz); + bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); + int enzInt = countEnzymatic(sequence, enz); + String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); + + // One PeptideSpectrumHit with all its features + String lis = "_tandem_output_file_target_" + scannumber + "_" + String(charge) + + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; + + // peptide Spectrum Hit pushed to the output file + txt.addLine(lis); + } + } + + LOG_INFO << "read in decoy file" << endl; + // get all the features from the decoy file + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + if (it->isHigherScoreBetter()) + { + String scannumber = String(it->getMetaValue("spectrum_reference")); + int charge = it->getHits().front().getCharge(); + int label = -1; + double hyperscore = it->getHits().front().getScore(); + // deltascore = hyperscore - nextscore + double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); + String sequence = it->getHits().front().getSequence().toString(); + int length = sequence.length(); + + // Find out correct ion types and get its Values + stringstream ss_ion_2; + + if (it->getHits().front().getMetaValue("a_score").toString() != "" && it->getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("b_score").toString() != "" && it->getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion_2 << 
double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("c_score").toString() != "" && it->getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("x_score").toString() != "" && it->getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("y_score").toString() != "" && it->getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("z_score").toString() != "" && it->getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length; + } + double mass = it->getHits().front().getMetaValue("mass"); + double dm = double(it->getHits().front().getMetaValue("delta")); + double mh = mass + dm; + double absdM = abs(dm); + + // write 1 for the correct charge, 0 for other charges + // i.e: charge 3 for charges from 2-5: 0 1 0 0 + stringstream ss; + int i = minCharge; + while (i <= maxCharge) + { + if (charge != i) + { + ss << "0" << out_sep; + } + if (charge == i) + { + ss << "1" << out_sep; + } + i++; + } + + char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); + char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); + + String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; + + // formula taken from percolator converter isEnz(n, c) for trypsin + bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); + bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); + int enzInt = countEnzymatic(sequence, enz); + String protein = 
it->getHits().front().getPeptideEvidences().front().getProteinAccession(); + + // One PeptideSpectrumHit with all its features + String lis = "_tandem_output_file_decoy_" + scannumber + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep + + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; + + // peptide Spectrum Hit pushed to the output file + txt.addLine(lis); + } + } + } + + void TopPerc::prepareMASCOTpin(vector& peptide_ids, string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep) + { + /** +Features 1-9 Represent the Basic Feature Set and Features 1-18 Represent the Extended Feature Set As Used in Mascot Percolatora + +feature abbreviation feature description +1. mass Calculated monoisotopic mass of the identified peptide. +2. charge Precursor ion charge +3. mScore Mascot score +4. dScore Mascot score minus Mascot score of next best nonisobaric peptide hit +5. deltaM Calculated minus observed peptide mass (in Dalton and ppm). +6. absDeltaM Absolute value of calculated minus observed peptide mass (in Dalton and ppm) +7. isoDeltaM Calculated minus observed peptide mass, isotope error corrected (in Dalton and ppm) +8. uniquePeps None (0), one (1), two or more (2) distinct peptide sequences match same protein +9. mc Missed tryptic cleavages +10. totInt Total ion intensity (log) +11. intMatchedTot Total matched ion intensity (log) +12. relIntMatchedTot Total matched ion intensity divided by total ion intensity +13. binom Peptide Score as described in ref 28 +14. fragMassError Mean fragment mass error (in Dalton and ppm) +15. absFragMassError Mean absolute fragment mass error (in Dalton and ppm) +16. 
fracIonsMatched Fraction of calculated ions matched (per ion series) +17. seqCov Sequence coverage of matched ions (per ion series) +18. intMatched Matched ion intensity (per ion series) + */ + + + // Create String of the charges for the header of the tab file + stringstream ss; + ss << "Charge" << minCharge << ", "; + for (int j = minCharge + 1; j < maxCharge + 1; j++) + { + + ss << "Charge" << j << ","; + } + + // Find out which ions are in XTandem-File and take only these as features + stringstream ss_ion; + if (peptide_ids.front().getHits().front().getMetaValue("a_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion << "frac_ion_a" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("b_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion << "frac_ion_b" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("c_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion << "frac_ion_c" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("x_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion << "frac_ion_x" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("y_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion << "frac_ion_y" << ","; + } + if (peptide_ids.front().getHits().front().getMetaValue("z_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion << "frac_ion_z" << ","; + } + + // Create header for the features + String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + + ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; + StringList txt_header0 
= ListUtils::create(featureset); + // Insert the header with the features names to the file + txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); + + LOG_INFO << "read in target file" << endl; + // get all the features from the target file + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + if (it->isHigherScoreBetter()) + { + String scannumber = String(it->getMetaValue("spectrum_reference")); + int charge = it->getHits().front().getCharge(); + int label = 1; + double hyperscore = it->getHits().front().getScore(); + // deltascore = hyperscore - nextscore + double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); + String sequence = it->getHits().front().getSequence().toString(); + int length = sequence.length(); + + // Find out correct ion types and get its Values + stringstream ss_ion_2; + + if (it->getHits().front().getMetaValue("a_score").toString() != "" && + it->getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("b_score").toString() != "" && + it->getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("c_score").toString() != "" && + it->getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("x_score").toString() != "" && + it->getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("y_score").toString() != "" && + it->getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion_2 << 
double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("z_score").toString() != "" && + it->getHits().front().getMetaValue("z_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length << out_sep; + } + double mass = it->getHits().front().getMetaValue("mass"); + double dm = it->getHits().front().getMetaValue("delta"); + double mh = mass + dm; + double absdM = abs(dm); + + // write 1 for the correct charge, 0 for other charges + // i.e.: charge 3 for charges from 2-5: 0 1 0 0 + stringstream ss; + int i = minCharge; + while (i <= maxCharge) + { + if (charge != i) + { + ss << "0" << out_sep; + } + if (charge == i) + { + ss << "1" << out_sep; + } + i++; + } + + char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); + char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); + + String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; + + // formula taken from percolator converter isEnz(n, c) for trypsin + bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); + bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); + int enzInt = countEnzymatic(sequence, enz); + String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); + + // One PeptideSpectrumHit with all its features + String lis = "_tandem_output_file_target_" + scannumber + "_" + String(charge) + + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; + + // peptide Spectrum Hit pushed to the output file + txt.addLine(lis); + } + } + + LOG_INFO << "read in 
decoy file" << endl; + // get all the features from the decoy file + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + if (it->isHigherScoreBetter()) + { + String scannumber = String(it->getMetaValue("spectrum_reference")); + int charge = it->getHits().front().getCharge(); + int label = -1; + double hyperscore = it->getHits().front().getScore(); + // deltascore = hyperscore - nextscore + double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); + String sequence = it->getHits().front().getSequence().toString(); + int length = sequence.length(); + + // Find out correct ion types and get its Values + stringstream ss_ion_2; + + if (it->getHits().front().getMetaValue("a_score").toString() != "" && it->getHits().front().getMetaValue("a_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("b_score").toString() != "" && it->getHits().front().getMetaValue("b_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("c_score").toString() != "" && it->getHits().front().getMetaValue("c_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("x_score").toString() != "" && it->getHits().front().getMetaValue("x_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("y_score").toString() != "" && it->getHits().front().getMetaValue("y_ions").toString() != "") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; + } + if (it->getHits().front().getMetaValue("z_score").toString() != "" && it->getHits().front().getMetaValue("z_ions").toString() != 
"") + { + ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length; + } + double mass = it->getHits().front().getMetaValue("mass"); + double dm = double(it->getHits().front().getMetaValue("delta")); + double mh = mass + dm; + double absdM = abs(dm); + + // write 1 for the correct charge, 0 for other charges + // i.e: charge 3 for charges from 2-5: 0 1 0 0 + stringstream ss; + int i = minCharge; + while (i <= maxCharge) + { + if (charge != i) + { + ss << "0" << out_sep; + } + if (charge == i) + { + ss << "1" << out_sep; + } + i++; + } + + char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); + char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); + + String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; + + // formula taken from percolator converter isEnz(n, c) for trypsin + bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); + bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); + int enzInt = countEnzymatic(sequence, enz); + String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); + + // One PeptideSpectrumHit with all its features + String lis = "_tandem_output_file_decoy_" + scannumber + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep + + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; + + // peptide Spectrum Hit pushed to the output file + txt.addLine(lis); + } + } + } + + // Function taken from Enzyme.h from Percolator size_t TopPerc::countEnzymatic(String peptide, string enz) { From ac297da241aa0f145381aa85c607572f95760651 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Fri, 27 Nov 2015 17:40:31 
+0100 Subject: [PATCH 10/41] [FEATURE] added comet precolator input prep --- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 689 ++++++---------------- 1 file changed, 174 insertions(+), 515 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 29ccccc60ef..52bd61b5638 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -39,68 +39,6 @@ using namespace std; namespace OpenMS { - - bool TopPerc::isEnz(const char& n, const char& c, string& enz) - { - if (enz == "trypsin") - { - return ((n == 'K' || n == 'R') && c != 'P') || n == '-' || c == '-'; - } - else if (enz == "chymotrypsin") - { - return ((n == 'F' || n == 'W' || n == 'Y' || n == 'L') && c != 'P') || n == '-' || c == '-'; - } - else if (enz == "thermolysin") - { - return ((c == 'A' || c == 'F' || c == 'I' || c == 'L' || c == 'M' - || c == 'V' || (n == 'R' && c == 'G')) && n != 'D' && n != 'E') || n == '-' || c == '-'; - } - else if (enz == "proteinasek") - { - return (n == 'A' || n == 'E' || n == 'F' || n == 'I' || n == 'L' - || n == 'T' || n == 'V' || n == 'W' || n == 'Y') || n == '-' || c == '-'; - } - else if (enz == "pepsin") - { - return ((c == 'F' || c == 'L' || c == 'W' || c == 'Y' || n == 'F' - || n == 'L' || n == 'W' || n == 'Y') && n != 'R') || n == '-' || c == '-'; - } - else if (enz == "elastase") - { - return ((n == 'L' || n == 'V' || n == 'A' || n == 'G') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "lys-n") - { - return (c == 'K') - || n == '-' || c == '-'; - } - else if (enz == "lys-c") - { - return ((n == 'K') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "arg-c") - { - return ((n == 'R') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "asp-n") - { - return (c == 'D') - || n == '-' || c == '-'; - } - else if (enz == "glu-c") - { - return ((n == 'E') && (c != 'P')) - || n == '-' || c == '-'; - } - else - { - return true; - } - } - void 
TopPerc::prepareCUSTOMpin(vector& peptide_ids, string& enz, TextFile& txt, vector& user_param_features, char out_sep) { // Create header for the features @@ -552,22 +490,6 @@ namespace OpenMS void TopPerc::prepareCOMETpin(vector& peptide_ids, string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep) { - /** -no decoy comet search -id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charge1 Charge2 Charge3 Charge4 Charge5 Charge6 enzN enzC enzInt lnNumSP dM absdM peptide proteinId1 -/home/.../150209_msms4_45_3_1 1 45 2.564949 0.120106 0.058356 1.58511 0.917335 94.189621 0.1875 1541.939549 13 0 0 1 0 0 0 0 0 2 8.659387 -0.000001 0.000001 H.FVIIIRKQTDLPV.I XXX_sp|P30307|MPIP3_HUMAN -/home/.../150209_msms4_55_2_1 1 55 2.484907 0.19764 0.077428 0.919862 0.757954 66.903687 0.375 1087.697313 9 0 1 0 0 0 0 0 0 3 7.689829 0.000002 0.000002 G.TRSLKRLLT.A XXX_sp|Q8IWD5|MFS6L_HUMAN -/home/.../150209_msms4_58_2_1 1 58 2.772589 0.304528 0.119574 0.471976 0.875639 44.227581 0.2222 1086.695849 10 0 1 0 0 0 0 1 0 5 8.254269 -0.000003 0.000003 K.KSAKKTPKKA.K sp|P16403|H12_HUMAN -/home/.../150209_msms4_70_3_1 1 70 1.609438 0.22949 0.161329 0.194216 1.455887 249.314102 0.2708 1399.760349 13 0 0 1 0 0 0 0 1 1 10.29438 0.000009 0.000009 V.KFNGAHIPGSPFK.I sp|Q14315|FLNC_HUMAN -/home/.../150209_msms4_85_3_1 1 85 0 0.330495 0.2621 -1.688895 2.168427 926.547668 0.4545 1412.87949 12 0 0 1 0 0 0 1 0 5 8.278428 -0.000001 0.000001 K.RAKAKTTKKRPQ.R sp|P24844|MYL9_HUMAN -/home/.../150209_msms4_89_3_1 1 89 2.995732 0.105586 0.028294 1.622349 1.383884 216.628876 0.2857 1412.88068 15 0 0 1 0 0 0 0 0 0 8.322637 0.000006 0.000006 I.AVVSVTVLLAISLAG.M sp|P55017|S12A3_HUMAN -/home/.../150209_msms4_93_3_1 1 93 3.931826 0.247239 0.074946 0.447477 1.304711 117.67543 0.2115 1480.967259 14 0 0 1 0 0 0 0 1 4 6.850126 -0.000001 0.000001 P.AKKPKAAKAKKPSK.A XXX_sp|P10412|H14_HUMAN -/home/.../150209_msms4_95_2_1 1 95 0.693147 0.088233 0.011378 1.549132 1.088933 306.906311 0.5 
851.43651 8 0 1 0 0 0 0 1 0 0 10.106918 0.000008 0.000008 R.FSPGIPAY.P sp|Q96MF6|CQ10A_HUMAN -/home/.../150209_msms4_97_3_1 1 97 4.007333 0.090959 0.061997 1.593871 1.009504 106.411644 0.1538 1569.985447 14 0 0 1 0 0 0 0 1 4 7.784889 -0.000004 0.000004 T.RRSQALKKLVGSVK.S XXX_sp|P13994|CC130_HUMAN -/home/.../150209_msms4_99_3_1 1 99 0 0.43843 0.227306 -3.148641 2.312463 637.051819 0.4038 1480.967716 14 0 0 1 0 0 0 1 1 5 6.919684 -0.000001 0.000001 K.KAKSPKKAKAAKPK.K sp|P10412|H14_HUMAN -/home/.../150209_msms4_100_3_1 1 100 4.406719 0.091476 0.051751 1.433108 1.358069 157.195969 0.2115 1568.978001 14 0 0 1 0 0 0 0 0 5 7.985484 -0.000007 0.000007 A.KALKGKEPPKKVFV.G sp|O14979|HNRDL_HUMAN -/home/.../150209_msms4_103_3_1 1 103 4.094345 0.20423 0.129201 0.77962 0.981422 40.896797 0.1607 1568.984593 15 0 0 1 0 0 0 0 0 0 7.904704 0.00001 0.00001 N.LLGLIEMILLSVGVV.M sp|P16671|CD36_HUMAN -/home/.../150209_msms4_104_3_1 1 104 3.970292 0.290208 0.198715 -0.32615 1.332213 89.666199 0.1875 1587.912022 13 0 0 1 0 0 0 0 1 3 9.75022 -0.000004 0.000004 T.QDGLFLRRAISRR.Y XXX_sp|Q2LD37|K1109_HUMAN - */ /** -with decoy comet search id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charge1 Charge2 Charge3 Charge4 Charge5 Charge6 enzN enzC enzInt lnNumSP dM absdM peptide proteinId1 /home/.../150209_msms4_45_3_1 1 45 2.890372 0.066992 0.055908 2.212066 0.917335 94.189621 0.1875 1541.939549 13 0 0 1 0 0 0 0 0 2 9.352534 -0.000001 0.000001 H.FVIIIRKQTDLPV.I XXX_sp|P30307|MPIP3_HUMAN @@ -579,486 +501,223 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg // Create String of the charges for the header of the tab file stringstream ss; ss << "Charge" << minCharge << ", "; - for (int j = minCharge + 1; j < maxCharge + 1; j++) + for (int j = minCharge+1; j <= maxCharge; j++) { - ss << "Charge" << j << ","; } - // Find out which ions are in XTandem-File and take only these as features - stringstream ss_ion; - if 
(peptide_ids.front().getHits().front().getMetaValue("a_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion << "frac_ion_a" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("b_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion << "frac_ion_b" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("c_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion << "frac_ion_c" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("x_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion << "frac_ion_x" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("y_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion << "frac_ion_y" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("z_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion << "frac_ion_z" << ","; - } - - // Create header for the features - String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + - ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; - StringList txt_header0 = ListUtils::create(featureset); + String featureset = "id,label,ScanNr,lnrSp,deltLCn,deltCn,lnExpect,Xcorr,Sp,IonFrac,Mass,PepLen, + + ss.str() + + "enzN,enzC,enzInt,lnNumSP,dM,absdM,peptide,proteinId1"; + StringList txt_header = ListUtils::create(featureset); // Insert the header with the features names to the file - txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); + txt.addLine(ListUtils::concatenate(txt_header, out_sep)); - LOG_INFO << "read in target file" << endl; - // get all the features from the 
target file + // get all the feature values for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { - if (it->isHigherScoreBetter()) + double deltaLCn = 0; + for (vector::iterator jt = it->getHits().begin(); jt != it->getHits().end(); ++jt) { - String scannumber = String(it->getMetaValue("spectrum_reference")); - int charge = it->getHits().front().getCharge(); + deltaLCn += jt->getMetaValue("MS:1002253"); + } + it->sort(); + it->assignRanks(); + String scannumber = String(it->getMetaValue("spectrum_reference")); + for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) + { + StringList idents; + sid.push_back(it->getBaseName()); + sid.push_back(scannumber); + sid.push_back(String(jt->getRank())); + String sid = ListUtils::concatenate(idents, "_"); + int charge = jt->getCharge(); int label = 1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out correct ion types and get its Values - stringstream ss_ion_2; - - if (it->getHits().front().getMetaValue("a_score").toString() != "" && - it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && - it->getHits().front().getMetaValue("b_ions").toString() != "") + if (jt->metaValueExists("target_decoy") && jt->getMetaValue("target_decoy").hasSubstring("decoy")) { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" && - it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / 
length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && - it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && - it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && - it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length << out_sep; + label = -1; } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = it->getHits().front().getMetaValue("delta"); - double mh = mass + dm; - double absdM = abs(dm); - + //Xcorr + String xcorr = String(jt->getMetaValue("MS:1002252")); + //deltCn + String deltaCn = String(jt->getMetaValue("MS:1002253")); + //TODO in comet pep.xml consumption get deltaCn + //deltLCn deltaCn between first and last, i.e. sum in peptidehit + //lnExpect + String lnExpect = String(log(jt->getMetaValue("MS:1002257"))); + //Sp + String sp = String(jt->getMetaValue("MS:1002255")); + //lnrSp log n rank Sp + String lnrSp = String(log(jt->getMetaValue("MS:1002256"))); + //TODO in comet pep.xml consumption get SP rank into MetaValue + //IonFrac + String ionfrac = jt->getMetaValue("MS:1002258")/jt->getMetaValue("MS:1002259"); + //TODO in comet pep.xml consumption get matched ions and total ions + //Mass + double mass = jt->getSequence().getMonoWeight(Residue::Full, charge)/charge; + //PepLen + int peplen = jt->getSequence().size(); //NB comet assigns peplen 0 to decoys? 
+ //Chargen + StringList chargen; // write 1 for the correct charge, 0 for other charges - // i.e.: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin + for (int i = minCharge; i <= maxCharge; ++i) + { + if (charge != i) + { + chargen.push_back("0"); + } + else + { + chargen.push_back("1"); + } + } + //enzN bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); + //enzC bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); + //enzInt int enzInt = countEnzymatic(sequence, enz); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_target_" + scannumber + "_" + String(charge) + - "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + - out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + - String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + - String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); + //lnNumSP + //this is practically not obtainable, as this seems to be the logn of the number of + //internally matched decoy or target hits to that spectrum query depending on the current hit itself + //is approximated by number of matched peptides + String lnNumSP = 
String(log(jt->getMetaValue("matched_peptides"))); + //TODO in comet pep.xml consumption get matched_peptides into PeptideHit + //dM + double dm = it->getMZ() - mass; + //absdM + double absdM = abs(dm); + //peptide + String sequence = ""; + sequence += String(jt->getPeptideEvidences().front().getAABefore()); // just first peptide evidence + sequence += jt->getSequence().toString(); + sequence += String(jt->getPeptideEvidences().front().getAAAfter()); //just first peptide evidence + //proteinId1 + String pepevid = ""; + for (vector::iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) + { + pev +=kt->getProteinAccession(); + } + + StringList row; + row.push_back(sid); + row.push_back(label); + row.push_back(scannumber); + row.push_back(lnrSp); + row.push_back(deltaLCn); + row.push_back(deltaCn); + row.push_back(lnExpect); + row.push_back(xcorr); + row.push_back(sp); + row.push_back(ionfrac); + row.push_back(String(mass)); + row.push_back(String(peplen)); + row.push_back(ListUtils::concatenate(chargen, out_sep)); + row.push_back(String(enzN)); + row.push_back(String(enzC)); + row.push_back(String(enzInt)); + row.push_back(lnNumSP); + row.push_back(String(dM)); + row.push_back(String(absdM)); + row.push_back(sequence); + row.push_back(pepevid); + + txt.addLine(ListUtils::concatenate(row, out_sep)); } } + } - LOG_INFO << "read in decoy file" << endl; - // get all the features from the decoy file - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - if (it->isHigherScoreBetter()) - { - String scannumber = String(it->getMetaValue("spectrum_reference")); - int charge = it->getHits().front().getCharge(); - int label = -1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length 
= sequence.length(); + void TopPerc::prepareMASCOTpin(vector& peptide_ids, string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep) + { + /** +Features 1-9 Represent the Basic Feature Set and Features 10-18 Represent the Extended Feature Set As Used in Mascot Percolator - // Find out correct ion types and get its Values - stringstream ss_ion_2; +feature abbreviation feature description +1. mass Calculated monoisotopic mass of the identified peptide. +2. charge Precursor ion charge +3. mScore Mascot score +4. dScore Mascot score minus Mascot score of next best nonisobaric peptide hit +5. deltaM Calculated minus observed peptide mass (in Dalton and ppm). +6. absDeltaM Absolute value of calculated minus observed peptide mass (in Dalton and ppm) +7. isoDeltaM Calculated minus observed peptide mass, isotope error corrected (in Dalton and ppm) +8. uniquePeps None (0), one (1), two or more (2) distinct peptide sequences match same protein +9. mc Missed tryptic cleavages +10. totInt Total ion intensity (log) +11. intMatchedTot Total matched ion intensity (log) +12. relIntMatchedTot Total matched ion intensity divided by total ion intensity +13. binom Peptide Score as described in ref 28 +14. fragMassError Mean fragment mass error (in Dalton and ppm) +15. absFragMassError Mean absolute fragment mass error (in Dalton and ppm) +16. fracIonsMatched Fraction of calculated ions matched (per ion series) +17. seqCov Sequence coverage of matched ions (per ion series) +18. 
intMatched Matched ion intensity (per ion series) + */ - if (it->getHits().front().getMetaValue("a_score").toString() != "" && it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" && it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length; - } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = double(it->getHits().front().getMetaValue("delta")); - double mh = mass + dm; - double absdM = abs(dm); - - // write 1 for the correct charge, 0 for other charges - // i.e: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter 
= it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); - int enzInt = countEnzymatic(sequence, enz); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_decoy_" + scannumber + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep - + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); - } - } } - void TopPerc::prepareMASCOTpin(vector& peptide_ids, string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep) + // Function adapted from Enzyme.h in Percolator converter + bool TopPerc::isEnz(const char& n, const char& c, string& enz) { - /** -Features 1-9 Represent the Basic Feature Set and Features 1-18 Represent the Extended Feature Set As Used in Mascot Percolatora - -feature abbreviation feature description -1. mass Calculated monoisotopic mass of the identified peptide. -2. charge Precursor ion charge -3. mScore Mascot score -4. dScore Mascot score minus Mascot score of next best nonisobaric peptide hit -5. deltaM Calculated minus observed peptide mass (in Dalton and ppm). -6. absDeltaM Absolute value of calculated minus observed peptide mass (in Dalton and ppm) -7. 
isoDeltaM Calculated minus observed peptide mass, isotope error corrected (in Dalton and ppm) -8. uniquePeps None (0), one (1), two or more (2) distinct peptide sequences match same protein -9. mc Missed tryptic cleavages -10. totInt Total ion intensity (log) -11. intMatchedTot Total matched ion intensity (log) -12. relIntMatchedTot Total matched ion intensity divided by total ion intensity -13. binom Peptide Score as described in ref 28 -14. fragMassError Mean fragment mass error (in Dalton and ppm) -15. absFragMassError Mean absolute fragment mass error (in Dalton and ppm) -16. fracIonsMatched Fraction of calculated ions matched (per ion series) -17. seqCov Sequence coverage of matched ions (per ion series) -18. intMatched Matched ion intensity (per ion series) - */ - - - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge + 1; j < maxCharge + 1; j++) + if (enz == "trypsin") { - - ss << "Charge" << j << ","; + return ((n == 'K' || n == 'R') && c != 'P') || n == '-' || c == '-'; } - - // Find out which ions are in XTandem-File and take only these as features - stringstream ss_ion; - if (peptide_ids.front().getHits().front().getMetaValue("a_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("a_ions").toString() != "") + else if (enz == "chymotrypsin") { - ss_ion << "frac_ion_a" << ","; + return ((n == 'F' || n == 'W' || n == 'Y' || n == 'L') && c != 'P') || n == '-' || c == '-'; } - if (peptide_ids.front().getHits().front().getMetaValue("b_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("b_ions").toString() != "") + else if (enz == "thermolysin") { - ss_ion << "frac_ion_b" << ","; + return ((c == 'A' || c == 'F' || c == 'I' || c == 'L' || c == 'M' + || c == 'V' || (n == 'R' && c == 'G')) && n != 'D' && n != 'E') || n == '-' || c == '-'; } - if 
(peptide_ids.front().getHits().front().getMetaValue("c_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("c_ions").toString() != "") + else if (enz == "proteinasek") { - ss_ion << "frac_ion_c" << ","; + return (n == 'A' || n == 'E' || n == 'F' || n == 'I' || n == 'L' + || n == 'T' || n == 'V' || n == 'W' || n == 'Y') || n == '-' || c == '-'; } - if (peptide_ids.front().getHits().front().getMetaValue("x_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("x_ions").toString() != "") + else if (enz == "pepsin") { - ss_ion << "frac_ion_x" << ","; + return ((c == 'F' || c == 'L' || c == 'W' || c == 'Y' || n == 'F' + || n == 'L' || n == 'W' || n == 'Y') && n != 'R') || n == '-' || c == '-'; } - if (peptide_ids.front().getHits().front().getMetaValue("y_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("y_ions").toString() != "") + else if (enz == "elastase") { - ss_ion << "frac_ion_y" << ","; + return ((n == 'L' || n == 'V' || n == 'A' || n == 'G') && c != 'P') + || n == '-' || c == '-'; } - if (peptide_ids.front().getHits().front().getMetaValue("z_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("z_ions").toString() != "") + else if (enz == "lys-n") { - ss_ion << "frac_ion_z" << ","; + return (c == 'K') + || n == '-' || c == '-'; } - - // Create header for the features - String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + - ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; - StringList txt_header0 = ListUtils::create(featureset); - // Insert the header with the features names to the file - txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); - - LOG_INFO << "read in target file" << endl; - // get all the features from the target file - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + else if (enz == "lys-c") { - if (it->isHigherScoreBetter()) - { - String 
scannumber = String(it->getMetaValue("spectrum_reference")); - int charge = it->getHits().front().getCharge(); - int label = 1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out correct ion types and get its Values - stringstream ss_ion_2; - - if (it->getHits().front().getMetaValue("a_score").toString() != "" && - it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && - it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" && - it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && - it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && - it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && - it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length << out_sep; - } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = 
it->getHits().front().getMetaValue("delta"); - double mh = mass + dm; - double absdM = abs(dm); - - // write 1 for the correct charge, 0 for other charges - // i.e.: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); - int enzInt = countEnzymatic(sequence, enz); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_target_" + scannumber + "_" + String(charge) + - "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + - out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + - String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + - String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); - } + return ((n == 'K') && c != 'P') + || n == '-' || c == '-'; } - - LOG_INFO << "read in decoy file" << endl; - // get all the features from the decoy file - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + else if (enz == "arg-c") { - if (it->isHigherScoreBetter()) - { - String scannumber = String(it->getMetaValue("spectrum_reference")); - int charge = 
it->getHits().front().getCharge(); - int label = -1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out correct ion types and get its Values - stringstream ss_ion_2; - - if (it->getHits().front().getMetaValue("a_score").toString() != "" && it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" && it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length; - } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = double(it->getHits().front().getMetaValue("delta")); - double mh = mass + dm; - double absdM = abs(dm); - - // 
write 1 for the correct charge, 0 for other charges - // i.e: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = minCharge; - while (i <= maxCharge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); - int enzInt = countEnzymatic(sequence, enz); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_decoy_" + scannumber + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep - + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); - } + return ((n == 'R') && c != 'P') + || n == '-' || c == '-'; + } + else if (enz == "asp-n") + { + return (c == 'D') + || n == '-' || c == '-'; + } + else if (enz == "glu-c") + { + return ((n == 'E') && (c != 'P')) + || n == '-' || c == '-'; + } + else + { + return true; } } - - // Function taken from Enzyme.h from Percolator + // Function adapted from Enzyme.h in Percolator converter size_t TopPerc::countEnzymatic(String peptide, string enz) { size_t count = 0; @@ -1072,7 +731,7 @@ feature abbreviation 
feature description return count; } - // Function taken from the percolator converter MsgfplusReader + // Function adapted from MsgfplusReader in Percolator converter double TopPerc::rescaleFragmentFeature(double featureValue, int NumMatchedMainIons) { // Rescale the fragment features to penalize features calculated by few ions From 43d6a9841dbfd84919d03d32cdc665e921e51968 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Fri, 27 Nov 2015 19:42:22 +0100 Subject: [PATCH 11/41] [FIX] changed scan referencing --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 7 +- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 92 ++++++++++--------- 2 files changed, 54 insertions(+), 45 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 45d38f90928..0c78e427b5c 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -50,11 +50,14 @@ namespace OpenMS { public: static bool isEnz(const char& n, const char& c, std::string& enz); - static void prepareCUSTOMpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, std::vector& user_param_features, char out_sep='\t'); + static void prepareCUSTOMpin(std::vector& peptide_ids, TextFile& txt, std::vector& user_param_features, char out_sep='\t'); static void prepareMSGFpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int minCharge, int maxCharge, bool addMHC = false, char out_sep='\t'); static void prepareXTANDEMpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t'); - static size_t countEnzymatic(String peptide, std::string enz); + static void prepareCOMETpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t'); + static void prepareMASCOTpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t'); + static size_t countEnzymatic(String 
peptide, std::string& enz); static double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons); + static String getScanIdentifier(std::vector::iterator it, std::vector::iterator start); private: TopPerc(); virtual ~TopPerc(); diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 52bd61b5638..edd7097b153 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -39,7 +39,7 @@ using namespace std; namespace OpenMS { - void TopPerc::prepareCUSTOMpin(vector& peptide_ids, string& enz, TextFile& txt, vector& user_param_features, char out_sep) + void TopPerc::prepareCUSTOMpin(vector& peptide_ids, TextFile& txt, vector& user_param_features, char out_sep) { // Create header for the features string min_featureset = "SpecId, Label, ScanNr"; @@ -51,24 +51,18 @@ namespace OpenMS { for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - String spec_ref = it->getMetaValue("spectrum_reference").toString(); - vector scan_id; - spec_ref.split("scan=", scan_id); - String sid = scan_id.back(); + String exp_ref = it->getMetaValue("spectrum_reference").toString(); + String scannumber = getScanIdentifier(it, peptide_ids.begin()); int label = 1; - String SpecId = "target_SII_"; - if ((String(hit->getMetaValue("target_decoy"))).hasSubstring("decoy")) + if (hit->metaValueExists("target_decoy") && String(hit->getMetaValue("target_decoy")).hasSubstring("decoy")) { - SpecId = "decoy_SII_"; label = -1; } - SpecId += sid + "_" + String(hit->getCharge()); - StringList collected_feats; - collected_feats.push_back(SpecId); - collected_feats.push_back(label); - collected_feats.push_back(sid); + collected_feats.push_back(exp_ref); + collected_feats.push_back(String(label)); + collected_feats.push_back(scannumber); for (vector::const_iterator feat = user_param_features.begin(); feat != user_param_features.end(); ++feat) { @@ -130,11 +124,7 @@ 
namespace OpenMS int rank = hit->getRank(); int charge = hit->getCharge(); - String spec_ref = it->getMetaValue("spectrum_reference").toString(); - vector scan_id; - spec_ref.split("scan=", scan_id); - String sid = scan_id.back(); - + String scannumber = getScanIdentifier(it, peptide_ids.begin()); int label = 1; String SpecId = "target_SII_"; if ((String(hit->getMetaValue("target_decoy"))).hasSubstring("decoy")) @@ -143,7 +133,7 @@ namespace OpenMS label = -1; } - SpecId += sid + "_" + String(rank) + "_" + sid + "_" + String(charge) + "_" + String(rank); + SpecId += scannumber + "_" + String(rank) + "_" + String(charge); double rawScore = hit->getMetaValue("MS:1002049").toString().toDouble(); double denovoScore = hit->getMetaValue("MS:1002050").toString().toDouble(); @@ -222,7 +212,7 @@ namespace OpenMS String protein = hit->getPeptideEvidences().front().getProteinAccession(); // One PeptideSpectrumHit with all its features - String lis = SpecId + out_sep + String(label) + out_sep + scan_id[1] + out_sep + (String)rawScore + out_sep + + String lis = SpecId + out_sep + String(label) + out_sep + scannumber + out_sep + (String)rawScore + out_sep + (String)denovoScore + out_sep + (String)scoreRatio + out_sep + (String)energy + out_sep + (String)ln_eval + out_sep + (String)isotopeError + out_sep + (String)lnExplainedIonCurrentRatio + out_sep + (String)lnNTermIonCurrentRatio + out_sep + (String)lnCTermIonCurrentRatio + out_sep + (String)lnMS2IonCurrent @@ -306,9 +296,9 @@ namespace OpenMS // Create header for the features String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; - StringList txt_header0 = ListUtils::create(featureset); + StringList txt_header = ListUtils::create(featureset); // Insert the header with the features names to the file - txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); + txt.addLine(ListUtils::concatenate(txt_header, out_sep)); LOG_INFO 
<< "read in target file" << endl; // get all the features from the target file @@ -316,7 +306,7 @@ namespace OpenMS { if (it->isHigherScoreBetter()) { - String scannumber = String(it->getMetaValue("spectrum_reference")); + String scannumber = getScanIdentifier(it, peptide_ids.begin()); int charge = it->getHits().front().getCharge(); int label = 1; double hyperscore = it->getHits().front().getScore(); @@ -506,7 +496,7 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg ss << "Charge" << j << ","; } - String featureset = "id,label,ScanNr,lnrSp,deltLCn,deltCn,lnExpect,Xcorr,Sp,IonFrac,Mass,PepLen, + String featureset = "id,label,ScanNr,lnrSp,deltLCn,deltCn,lnExpect,Xcorr,Sp,IonFrac,Mass,PepLen," + ss.str() + "enzN,enzC,enzInt,lnNumSP,dM,absdM,peptide,proteinId1"; StringList txt_header = ListUtils::create(featureset); @@ -519,21 +509,22 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg double deltaLCn = 0; for (vector::iterator jt = it->getHits().begin(); jt != it->getHits().end(); ++jt) { - deltaLCn += jt->getMetaValue("MS:1002253"); + deltaLCn += double(jt->getMetaValue("MS:1002253")); } it->sort(); it->assignRanks(); - String scannumber = String(it->getMetaValue("spectrum_reference")); + String scannumber = getScanIdentifier(it, peptide_ids.begin()); + std::vector hits = it->getHits(); for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) { StringList idents; - sid.push_back(it->getBaseName()); - sid.push_back(scannumber); - sid.push_back(String(jt->getRank())); + idents.push_back(it->getBaseName()); + idents.push_back(scannumber); + idents.push_back(String(jt->getRank())); String sid = ListUtils::concatenate(idents, "_"); int charge = jt->getCharge(); int label = 1; - if (jt->metaValueExists("target_decoy") && jt->getMetaValue("target_decoy").hasSubstring("decoy")) + if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) { label = -1; } 
@@ -544,14 +535,14 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg //TODO in comet pep.xml consumption get deltaCn //deltLCn deltaCn between first and last, i.e. sum in peptidehit //lnExpect - String lnExpect = String(log(jt->getMetaValue("MS:1002257"))); + String lnExpect = String(log(double(jt->getMetaValue("MS:1002257")))); //Sp String sp = String(jt->getMetaValue("MS:1002255")); //lnrSp log n rank Sp - String lnrSp = String(log(jt->getMetaValue("MS:1002256"))); + String lnrSp = String(log(double(jt->getMetaValue("MS:1002256")))); //TODO in comet pep.xml consumption get SP rank into MetaValue //IonFrac - String ionfrac = jt->getMetaValue("MS:1002258")/jt->getMetaValue("MS:1002259"); + String ionfrac = double(jt->getMetaValue("MS:1002258"))/double(jt->getMetaValue("MS:1002259")); //TODO in comet pep.xml consumption get matched ions and total ions //Mass double mass = jt->getSequence().getMonoWeight(Residue::Full, charge)/charge; @@ -572,21 +563,21 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg } } //enzN - bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); + bool enzN = isEnz(jt->getPeptideEvidences().front().getAABefore(), jt->getSequence().getPrefix(1).toString().c_str()[0], enz); //enzC - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); + bool enzC = isEnz(jt->getSequence().getSuffix(1).toString().c_str()[0], jt->getPeptideEvidences().front().getAAAfter(), enz); //enzInt - int enzInt = countEnzymatic(sequence, enz); + int enzInt = countEnzymatic(jt->getSequence().toUnmodifiedString(), enz); //lnNumSP //this is practically not obtainable, as this seems to be the logn of the number of //internally matched decoy or target hits to that spectrum query depending on the current hit itself //is approximated by number of matched peptides - String lnNumSP = String(log(jt->getMetaValue("matched_peptides"))); + String lnNumSP = 
String(log(double(jt->getMetaValue("matched_peptides")))); //TODO in comet pep.xml consumption get matched_peptides into PeptideHit //dM double dm = it->getMZ() - mass; //absdM - double absdM = abs(dm); + double absdm = abs(dm); //peptide String sequence = ""; sequence += String(jt->getPeptideEvidences().front().getAABefore()); // just first peptide evidence @@ -594,9 +585,9 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg sequence += String(jt->getPeptideEvidences().front().getAAAfter()); //just first peptide evidence //proteinId1 String pepevid = ""; - for (vector::iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) + for (vector::const_iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) { - pev +=kt->getProteinAccession(); + pepevid += kt->getProteinAccession(); } StringList row; @@ -617,8 +608,8 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg row.push_back(String(enzC)); row.push_back(String(enzInt)); row.push_back(lnNumSP); - row.push_back(String(dM)); - row.push_back(String(absdM)); + row.push_back(String(dm)); + row.push_back(String(absdm)); row.push_back(sequence); row.push_back(pepevid); @@ -718,7 +709,7 @@ feature abbreviation feature description } // Function adapted from Enzyme.h in Percolator converter - size_t TopPerc::countEnzymatic(String peptide, string enz) + size_t TopPerc::countEnzymatic(String peptide, string& enz) { size_t count = 0; for (size_t ix = 1; ix < peptide.size(); ++ix) @@ -741,4 +732,19 @@ feature abbreviation feature description return featureValue * ((double)numerator / denominator); } + String TopPerc::getScanIdentifier(vector::iterator it, vector::iterator start) + { + String scannumber = it->getMetaValue("spectrum_reference"); + if (scannumber.empty()) + { + scannumber = String(it->getMetaValue("spectrum_id")); + if (scannumber.empty()) + { + scannumber = String(it - start + 
1); + LOG_WARN << "no known spectrum identifiers, using index [1,n] - use at own risk." << endl; + } + } + } + + } From 541600a2ebb93e24540ffbab9c7985fffe8629b4 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Fri, 27 Nov 2015 21:20:30 +0100 Subject: [PATCH 12/41] [FIX] fixed metavaluename for matched ions in comet usage, removed todos done --- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index edd7097b153..ab020380024 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -532,7 +532,6 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg String xcorr = String(jt->getMetaValue("MS:1002252")); //deltCn String deltaCn = String(jt->getMetaValue("MS:1002253")); - //TODO in comet pep.xml consumption get deltaCn //deltLCn deltaCn between first and last, i.e. sum in peptidehit //lnExpect String lnExpect = String(log(double(jt->getMetaValue("MS:1002257")))); @@ -540,10 +539,8 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg String sp = String(jt->getMetaValue("MS:1002255")); //lnrSp log n rank Sp String lnrSp = String(log(double(jt->getMetaValue("MS:1002256")))); - //TODO in comet pep.xml consumption get SP rank into MetaValue //IonFrac String ionfrac = double(jt->getMetaValue("MS:1002258"))/double(jt->getMetaValue("MS:1002259")); - //TODO in comet pep.xml consumption get matched ions and total ions //Mass double mass = jt->getSequence().getMonoWeight(Residue::Full, charge)/charge; //PepLen @@ -572,8 +569,7 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg //this is practically not obtainable, as this seems to be the logn of the number of //internally matched decoy or target hits to that spectrum query depending on the current hit itself //is approximated by number of matched 
peptides - String lnNumSP = String(log(double(jt->getMetaValue("matched_peptides")))); - //TODO in comet pep.xml consumption get matched_peptides into PeptideHit + String lnNumSP = String(log(double(jt->getMetaValue("num_matched_peptides")))); //dM double dm = it->getMZ() - mass; //absdM From c209934a94f340ec62b4eee40ffda91c35b69473 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Fri, 27 Nov 2015 21:34:15 +0100 Subject: [PATCH 13/41] [NOP] minor naming consistency correction --- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index ab020380024..eb3ffa66013 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -92,24 +92,24 @@ namespace OpenMS // Create header for the features string featureset = "SpecId, Label,ScanNr, RawScore, DeNovoScore,ScoreRatio, Energy,lnEValue,IsotopeError, lnExplainedIonCurrentRatio,lnNTermIonCurrentRatio,lnCTermIonCurrentRatio,lnMS2IonCurrent,Mass,PepLen,dM,absdM,MeanErrorTop7,sqMeanErrorTop7,StdevErrorTop7," + ss.str() ; - StringList txt_header0 = ListUtils::create(featureset); + StringList txt_header = ListUtils::create(featureset); if (addMHC) { - txt_header0.push_back("enzN"); - txt_header0.push_back("enzC"); - txt_header0.push_back("MHCLct"); - txt_header0.push_back("Peptide"); - txt_header0.push_back("Protein"); + txt_header.push_back("enzN"); + txt_header.push_back("enzC"); + txt_header.push_back("MHCLct"); + txt_header.push_back("Peptide"); + txt_header.push_back("Protein"); } else { - txt_header0.push_back("enzN"); - txt_header0.push_back("enzC"); - txt_header0.push_back("enzInt"); - txt_header0.push_back("Peptide"); - txt_header0.push_back("Protein"); + txt_header.push_back("enzN"); + txt_header.push_back("enzC"); + txt_header.push_back("enzInt"); + txt_header.push_back("Peptide"); + 
txt_header.push_back("Protein"); } - txt.addLine(ListUtils::concatenate(txt_header0, out_sep)); + txt.addLine(ListUtils::concatenate(txt_header, out_sep)); for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { From 6557da3fc75630f5b17b8c1424acc9edd7b62a3e Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sat, 28 Nov 2015 13:24:13 +0100 Subject: [PATCH 14/41] [FEATURE] added mascot feature set - not even close to mascotpercolator feature set due to a lot of missing information (lost in mascotadapter) --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 2 + src/openms/source/ANALYSIS/ID/TopPerc.cpp | 134 ++++++++++++++++++ 2 files changed, 136 insertions(+) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 0c78e427b5c..ae7f12d2911 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -58,6 +58,8 @@ namespace OpenMS static size_t countEnzymatic(String peptide, std::string& enz); static double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons); static String getScanIdentifier(std::vector::iterator it, std::vector::iterator start); + static void assignDeltaScore(std::vector& hits, String score_ref); + private: TopPerc(); virtual ~TopPerc(); diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index eb3ffa66013..309227d7b40 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -639,7 +639,124 @@ feature abbreviation feature description 17. seqCov Sequence coverage of matched ions (per ion series) 18. 
intMatched Matched ion intensity (per ion series) */ + // Create String of the charges for the header of the tab file + stringstream ss; + ss << "Charge" << minCharge << ", "; + for (int j = minCharge+1; j <= maxCharge; j++) + { + ss << "Charge" << j << ","; + } + + String featureset = "id,label,ScanNr,mass," + + ss.str() + + "mScore,dScore,deltaMass, absDeltaMass, uniqueToProt, enzN, enzC, enzInt, mod,sequence,protein"; + StringList txt_header = ListUtils::create(featureset); + // Insert the header with the features names to the file + txt.addLine(ListUtils::concatenate(txt_header, out_sep)); + + // get all the feature values + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + double deltaLCn = 0; + for (vector::iterator jt = it->getHits().begin(); jt != it->getHits().end(); ++jt) + { + deltaLCn += double(jt->getMetaValue("MS:1002253")); + } + it->sort(); + it->assignRanks(); + String scannumber = getScanIdentifier(it, peptide_ids.begin()); + it->sort(); + it->assignRanks(); + std::vector hits = it->getHits(); + assignDeltaScore(hits, "MS:1001171"); + for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) + { + StringList idents; + idents.push_back(it->getBaseName()); + idents.push_back(scannumber); + idents.push_back(String(jt->getRank())); + String sid = ListUtils::concatenate(idents, "_"); + int label = 1; + if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) + { + label = -1; + } + int charge = jt->getCharge(); + double mass = jt->getSequence().getMonoWeight(Residue::Full, charge)/charge; + //Chargen + StringList chargen; + // write 1 for the correct charge, 0 for other charges + for (int i = minCharge; i <= maxCharge; ++i) + { + if (charge != i) + { + chargen.push_back("0"); + } + else + { + chargen.push_back("1"); + } + } + double mScore = double(jt->getMetaValue("MS:1001171")); + double dScore = double(jt->getMetaValue("delta_score")); + double dm = 
it->getMZ() - mass; + double absdm = abs(dm); + //no isoDeltaM - no isotope error info available from mascot adapter + //no uniquePeps - no info from mascot substitute with sequence to protein uniqueness + String uniquePeps = "0"; + if (String(jt->getMetaValue("protein_references")) == "unique") + { + uniquePeps = "1"; + } + bool enzN = isEnz(jt->getPeptideEvidences().front().getAABefore(), jt->getSequence().getPrefix(1).toString().c_str()[0], enz); + //enzC + bool enzC = isEnz(jt->getSequence().getSuffix(1).toString().c_str()[0], jt->getPeptideEvidences().front().getAAAfter(), enz); + //enzInt + int enzInt = countEnzymatic(jt->getSequence().toUnmodifiedString(), enz); + //no totInt info available from mascot adapter + //no intMatchedTot info available from mascot adapter + //no relIntMatchedTot info available from mascot adapter + //no binom info available from mascot adapter + //no fragMassError info available from mascot adapter + //no absFragMassError info available from mascot adapter + //no fracIonsMatched info available from mascot adapter + //no seqCov info available from mascot adapter + //no intMatched info available from mascot adapter + + bool mod = jt->getSequence().isModified(); + String sequence = ""; + sequence += String(jt->getPeptideEvidences().front().getAABefore()); // just first peptide evidence + sequence += jt->getSequence().toString(); + sequence += String(jt->getPeptideEvidences().front().getAAAfter()); //just first peptide evidence + //proteinId1 + String pepevid = ""; + for (vector::const_iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) + { + pepevid += kt->getProteinAccession(); + } + + StringList row; + row.push_back(sid); + row.push_back(label); + row.push_back(scannumber); + row.push_back(String(mass)); + row.push_back(ListUtils::concatenate(chargen, out_sep)); + row.push_back(String(mScore)); + row.push_back(String(dScore)); + row.push_back(String(dm)); + row.push_back(String(absdm)); + 
row.push_back(String(uniquePeps)); + row.push_back(String(enzN)); + row.push_back(String(enzC)); + row.push_back(String(enzInt)); + row.push_back(String(mod)); + row.push_back(sequence); + row.push_back(pepevid); + + txt.addLine(ListUtils::concatenate(row, out_sep)); + } + } } // Function adapted from Enzyme.h in Percolator converter @@ -742,5 +859,22 @@ feature abbreviation feature description } } + void TopPerc::assignDeltaScore(vector& hits, String score_ref) + { + if (!hits.empty()) + { + vector::iterator prev = hits.begin(); + double prev_score = double(prev->getMetaValue(score_ref)); + for (vector::iterator jt = hits.begin()+1; jt != hits.end(); ++jt) + { + double cur_score = double(jt->getMetaValue(score_ref)); + double value = prev_score - cur_score; + prev->setMetaValue("delta_score",value); + prev = jt; + } + (hits.end()-1)->setMetaValue("delta_score",0.0); //if last hit or only one hit + } + } + } From 4fcc2e7012b02639c6aae64c8eee36c5e9269a19 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sun, 29 May 2016 13:28:35 +0200 Subject: [PATCH 15/41] [FIX] some refactoring and fixing the result storage --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 67 ++++- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 45 ++-- src/utils/TopPerc.cpp | 251 ++++++++++-------- 3 files changed, 233 insertions(+), 130 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index ae7f12d2911..cba007bc969 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -48,13 +48,72 @@ namespace OpenMS { class OPENMS_DLLAPI TopPerc { + public: + struct PercolatorResult + { + String PSMId; + double score; + double qvalue; + double posterior_error_prob; + String peptide; + char preAA; + char postAA; + StringList proteinIds; + + PercolatorResult(const String& pid, const double s, const double q, const String& p, const char pre, const char pos, const StringList& pl): + PSMId 
(pid), + score (s), + qvalue (q), + peptide (p), + preAA (pre), + postAA (pos), + proteinIds (pl) + { + } + + PercolatorResult(StringList& row): + proteinIds() + { + // peptide sequence + StringList pep; + row[4].split(".", pep); + //TODO test pep size 3 + peptide = pep[1]; + preAA = pep[0]=="-"?'[':pep[0].c_str()[0]; // const char PeptideEvidence::N_TERMINAL_AA = '['; + postAA = pep[2]=="-"?']':pep[2].c_str()[0]; // const char PeptideEvidence::C_TERMINAL_AA = ']'; + // SVM-score + score = row[1].toDouble(); + // q-Value + qvalue = row[2].toDouble(); + // PEP + posterior_error_prob = row[3].toDouble(); + // scannr. as written in preparePIN + PSMId = row[0]; + proteinIds = std::vector(row.begin()+5,row.end()); + } + + bool operator!=(const TopPerc::PercolatorResult& rhs) const + { + if (PSMId != rhs.PSMId || score != rhs.score || qvalue != rhs.qvalue || + posterior_error_prob != rhs.posterior_error_prob || peptide != rhs.peptide || + proteinIds != rhs.proteinIds) + return true; + return false; + } + + bool operator==(const TopPerc::PercolatorResult& rhs) const + { + return !(operator !=(rhs)); + } + }; + public: static bool isEnz(const char& n, const char& c, std::string& enz); static void prepareCUSTOMpin(std::vector& peptide_ids, TextFile& txt, std::vector& user_param_features, char out_sep='\t'); - static void prepareMSGFpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int minCharge, int maxCharge, bool addMHC = false, char out_sep='\t'); - static void prepareXTANDEMpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t'); - static void prepareCOMETpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t'); - static void prepareMASCOTpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep='\t'); + static void prepareMSGFpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int 
min_charge, int max_charge, bool addMHC = false, char out_sep='\t'); + static void prepareXTANDEMpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); + static void prepareCOMETpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); + static void prepareMASCOTpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); static size_t countEnzymatic(String peptide, std::string& enz); static double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons); static String getScanIdentifier(std::vector::iterator it, std::vector::iterator start); diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 309227d7b40..e43042727fc 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -39,6 +39,10 @@ using namespace std; namespace OpenMS { + //TODO for all prepare* PSMId as written in PeptideIdentification::spectrum_reference + // and pre/post AA as - if begin/end of protein ([/] in PeptideEvidence) + //id label scannr feature1 ... featureN peptide proteinId1 .. 
proteinIdM + void TopPerc::prepareCUSTOMpin(vector& peptide_ids, TextFile& txt, vector& user_param_features, char out_sep) { // Create header for the features @@ -80,12 +84,12 @@ namespace OpenMS } } - void TopPerc::prepareMSGFpin(vector& peptide_ids, string& enz, TextFile& txt, int minCharge, int maxCharge, bool addMHC, char out_sep) + void TopPerc::prepareMSGFpin(vector& peptide_ids, string& enz, TextFile& txt, int min_charge, int max_charge, bool addMHC, char out_sep) { // Create String of the charges for the header of the tab file stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge + 1; j < maxCharge + 1; j++) + ss << "Charge" << min_charge << ", "; + for (int j = min_charge + 1; j < max_charge + 1; j++) { ss << "Charge" << j << ","; } @@ -184,8 +188,8 @@ namespace OpenMS // write 1 for the correct charge, 0 for other charges // i.e.: charge 3 for charges from 2-5: 0 1 0 0 stringstream ss; - int i = minCharge; - while (i <= maxCharge) + int i = min_charge; + while (i <= max_charge) { if (charge != i) { @@ -249,12 +253,12 @@ namespace OpenMS } } - void TopPerc::prepareXTANDEMpin(vector& peptide_ids, string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep) + void TopPerc::prepareXTANDEMpin(vector& peptide_ids, string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep) { // Create String of the charges for the header of the tab file stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge + 1; j < maxCharge + 1; j++) + ss << "Charge" << min_charge << ", "; + for (int j = min_charge + 1; j < max_charge + 1; j++) { ss << "Charge" << j << ","; @@ -356,8 +360,8 @@ namespace OpenMS // write 1 for the correct charge, 0 for other charges // i.e.: charge 3 for charges from 2-5: 0 1 0 0 stringstream ss; - int i = minCharge; - while (i <= maxCharge) + int i = min_charge; + while (i <= max_charge) { if (charge != i) { @@ -443,8 +447,8 @@ namespace OpenMS // write 1 for the correct charge, 0 
for other charges // i.e: charge 3 for charges from 2-5: 0 1 0 0 stringstream ss; - int i = minCharge; - while (i <= maxCharge) + int i = min_charge; + while (i <= max_charge) { if (charge != i) { @@ -478,7 +482,7 @@ namespace OpenMS } } - void TopPerc::prepareCOMETpin(vector& peptide_ids, string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep) + void TopPerc::prepareCOMETpin(vector& peptide_ids, string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep) { /** -with decoy comet search id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charge1 Charge2 Charge3 Charge4 Charge5 Charge6 enzN enzC enzInt lnNumSP dM absdM peptide proteinId1 @@ -490,8 +494,8 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg // Create String of the charges for the header of the tab file stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge+1; j <= maxCharge; j++) + ss << "Charge" << min_charge << ", "; + for (int j = min_charge+1; j <= max_charge; j++) { ss << "Charge" << j << ","; } @@ -548,7 +552,7 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg //Chargen StringList chargen; // write 1 for the correct charge, 0 for other charges - for (int i = minCharge; i <= maxCharge; ++i) + for (int i = min_charge; i <= max_charge; ++i) { if (charge != i) { @@ -614,7 +618,7 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg } } - void TopPerc::prepareMASCOTpin(vector& peptide_ids, string& enz, TextFile& txt, int minCharge, int maxCharge, char out_sep) + void TopPerc::prepareMASCOTpin(vector& peptide_ids, string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep) { /** Features 1-9 Represent the Basic Feature Set and Features 10-18 Represent the Extended Feature Set As Used in Mascot Percolator @@ -641,8 +645,8 @@ feature abbreviation feature description */ // Create String of the charges for the header of 
the tab file stringstream ss; - ss << "Charge" << minCharge << ", "; - for (int j = minCharge+1; j <= maxCharge; j++) + ss << "Charge" << min_charge << ", "; + for (int j = min_charge+1; j <= max_charge; j++) { ss << "Charge" << j << ","; } @@ -687,7 +691,7 @@ feature abbreviation feature description //Chargen StringList chargen; // write 1 for the correct charge, 0 for other charges - for (int i = minCharge; i <= maxCharge; ++i) + for (int i = min_charge; i <= max_charge; ++i) { if (charge != i) { @@ -857,6 +861,7 @@ feature abbreviation feature description LOG_WARN << "no known spectrum identifiers, using index [1,n] - use at own risk." << endl; } } + return scannumber; } void TopPerc::assignDeltaScore(vector& hits, String score_ref) diff --git a/src/utils/TopPerc.cpp b/src/utils/TopPerc.cpp index 08deb04c1cd..e6806db08ec 100644 --- a/src/utils/TopPerc.cpp +++ b/src/utils/TopPerc.cpp @@ -108,14 +108,13 @@ class TOPPPercolator : protected: void registerOptionsAndFlags_() { - registerInputFile_("percolator_executable", "", "", "Path to the percolator binary", true, false, ListUtils::create("skipexists")); registerInputFile_("in", "", "", "Input target file", true); setValidFormats_("in", ListUtils::create("mzid")); registerInputFile_("in_decoy", "", "", "Input decoy file", false); setValidFormats_("in_decoy", ListUtils::create("mzid")); registerOutputFile_("out", "", "", "Output file", true); std::string enzs = "no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin"; - registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: "+enzs , false, true); + registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: "+enzs , false); setValidStrings_("enzyme", ListUtils::create(enzs)); registerInputFile_("percolator_executable", "", // choose the default value according to the platform where it will be executed @@ -246,7 +245,6 @@ class TOPPPercolator : //------------------------------------------------------------- 
// read more input if necessary //------------------------------------------------------------- - //TODO check if this comes from the same search engine! if (!in_decoy.empty()) { vector decoy_peptide_ids; @@ -261,6 +259,48 @@ class TOPPPercolator : LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; MzIdentMLFile().load(in_decoy, decoy_protein_ids, decoy_peptide_ids); } + + //paranoia check if this comes from the same search engine! + { + if (decoy_protein_ids.front().getSearchEngine() != protein_ids.front().getSearchEngine() ) + { + LOG_WARN << "Warning about differing SearchEngine between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getScoreType() != protein_ids.front().getScoreType() ) + { + LOG_WARN << "Warning about differing SoreType between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getPrimaryMSRunPath() != protein_ids.front().getPrimaryMSRunPath() ) + { + LOG_WARN << "Warning about differing SearchInput between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().digestion_enzyme != protein_ids.front().getSearchParameters().digestion_enzyme ) + { + LOG_WARN << "Warning about differing DigestionEnzyme between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().variable_modifications != protein_ids.front().getSearchParameters().variable_modifications ) + { + LOG_WARN << "Warning about differing VarMods between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().fixed_modifications != protein_ids.front().getSearchParameters().fixed_modifications ) + { + LOG_WARN << "Warning about differing FixMods between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().charges != protein_ids.front().getSearchParameters().charges ) + { + LOG_WARN << "Warning about differing SearchCharges between 
target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().fragment_mass_tolerance != protein_ids.front().getSearchParameters().fragment_mass_tolerance ) + { + LOG_WARN << "Warning about differing FragTol between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().precursor_tolerance != protein_ids.front().getSearchParameters().precursor_tolerance ) + { + LOG_WARN << "Warning about differing PrecTol between target and decoy run" << endl; + } + } + + //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process for (std::vector::iterator pit = decoy_peptide_ids.begin(); pit != decoy_peptide_ids.end(); ++pit) { for (std::vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) @@ -269,7 +309,7 @@ class TOPPPercolator : //TODO what about proteins - internal target decoy handling is shitty - rework pls } } - //TODO this is going to fail with specrum_reference clashes if not handled _REALLY_ carefully + //TODO check overlap of ids in terms of spectrum id/reference peptide_ids.insert( peptide_ids.end(), decoy_peptide_ids.begin(), decoy_peptide_ids.end() ); protein_ids.insert( protein_ids.end(), decoy_protein_ids.begin(), decoy_protein_ids.end() ); writeLog_("Using decoy hits from separate file."); @@ -277,8 +317,6 @@ class TOPPPercolator : else { writeLog_("Using decoy hits from input id file. 
You did you use a target decoy search, did you?"); -// printUsage_(); -// return ILLEGAL_PARAMETERS; } @@ -286,33 +324,34 @@ class TOPPPercolator : // extract search engine and prepare pin //------------------------------------------------------------- String se = protein_ids.front().getSearchEngine(); + LOG_DEBUG << "Registered search engine: " << se << endl; TextFile txt; //TODO introduce min/max charge to parameters for now take available range - int maxCharge = 0; - int minCharge = 10; + int max_charge = 0; + int min_charge = 10; for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - if (hit->getCharge() > maxCharge) + if (hit->getCharge() > max_charge) { - maxCharge = hit->getCharge(); + max_charge = hit->getCharge(); } - if (hit->getCharge() < minCharge) + if (hit->getCharge() < min_charge) { - minCharge = hit->getCharge(); + min_charge = hit->getCharge(); } } } + LOG_DEBUG << "Using min/max charges of " << min_charge << "/" << max_charge << endl; - std::string enz_str = getStringOption_("enzyme"); - writeDebug_("Detected search engine: " + se , 2); - if (se == "MS-GF+") TopPerc::prepareMSGFpin(peptide_ids, enz_str, txt, minCharge, maxCharge, getFlag_("MHC")); -// if (se == "Mascot") prepareMASCOTpin(peptide_ids, txt, minCharge, maxCharge); - if (se == "XTandem") TopPerc::prepareXTANDEMpin(peptide_ids, enz_str, txt, minCharge, maxCharge); + string enz_str = getStringOption_("enzyme"); - writeLog_( "Executing percolator!"); + //TODO introduce custom feature selection from TopPerc::prepareCUSTOMpin to parameters + if (se == "MS-GF+") TopPerc::prepareMSGFpin(peptide_ids, enz_str, txt, min_charge, max_charge, getFlag_("MHC")); + if (se == "Mascot") TopPerc::prepareMASCOTpin(peptide_ids, enz_str, txt, min_charge, max_charge); + if (se == "XTandem") TopPerc::prepareXTANDEMpin(peptide_ids, enz_str, txt, min_charge, max_charge); // create temp 
directory to store percolator in file pin.tab temporarily String temp_directory_body = QDir::toNativeSeparators((File::getTempDirectory() + "/" + File::getUniqueName() + "/").toQString()); // body for the tmp files @@ -320,61 +359,64 @@ class TOPPPercolator : QDir d; d.mkpath(temp_directory_body.toQString()); } - String txt_designator = File::getUniqueName(); String pin_file(temp_directory_body + txt_designator + "_pin.tab"); String pout_file(temp_directory_body + txt_designator + "_pout.tab"); - - // File is stored in temp directory txt.store(pin_file); QStringList arguments; // Check all set parameters and get them into arguments StringList - arguments << "-r" << pout_file.toQString(); - if (getFlag_("e")) arguments << "-e"; - if (getFlag_("Z")) arguments << "-Z"; - if (getDoubleOption_("p") != 0.0) arguments << "-p" << String(getDoubleOption_("p")).toQString(); - if (getDoubleOption_("n") != 0.0) arguments << "-n" << String(getDoubleOption_("n")).toQString(); - if (getDoubleOption_("F") != 0.01) arguments << "-F" << String(getDoubleOption_("F")).toQString(); - if (getDoubleOption_("t") != 0.01) arguments << "-t" << String(getDoubleOption_("t")).toQString(); - if (getIntOption_("i") != 0) arguments << "-i" << String(getIntOption_("i")).toQString(); - if (getFlag_("x")) arguments << "-x"; - if (getDoubleOption_("f") != 0.6) arguments << "-f" << String(getDoubleOption_("f")).toQString(); - if (getStringOption_("J") != "") arguments << "-J" << getStringOption_("J").toQString(); - if (getStringOption_("k") != "") arguments << "-k" << getStringOption_("k").toQString(); - if (getStringOption_("w") != "") arguments << "-w" << getStringOption_("w").toQString(); - if (getStringOption_("W") != "") arguments << "-W" << getStringOption_("W").toQString(); - if (getStringOption_("V") != "") arguments << "-V" << getStringOption_("V").toQString(); - if (getIntOption_("v") != 2) arguments << "-v" << String(getIntOption_("v")).toQString(); - if (getFlag_("u")) arguments << "-u"; - 
if (getFlag_("R")) arguments << "-R"; - if (getFlag_("O")) arguments << "-O"; - if (getIntOption_("S") != 1) arguments << "-S" << String(getDoubleOption_("S")).toQString(); - if (getFlag_("K")) arguments << "-K"; - if (getFlag_("D")) arguments << "-D"; - if (getStringOption_("B") != "") arguments << "-B" << getStringOption_("B").toQString(); - if (getFlag_("U")) arguments << "-U"; - if (getFlag_("s")) arguments << "-s"; - if (getFlag_("A")) arguments << "-A"; - if (getDoubleOption_("a") != 0.0) arguments << "-a" << String(getDoubleOption_("a")).toQString(); - if (getDoubleOption_("b") != 0.0) arguments << "-b" << String(getDoubleOption_("b")).toQString(); - if (getDoubleOption_("G") != 0.0) arguments << "-G" << String(getDoubleOption_("G")).toQString(); - if (getFlag_("g")) arguments << "-g"; - if (getFlag_("I")) arguments << "-I"; - if (getFlag_("q")) arguments << "-q"; - if (getFlag_("N")) arguments << "-N"; - if (getFlag_("E")) arguments << "-E"; - if (getFlag_("C")) arguments << "-C"; - if (getIntOption_("d") != 0) arguments << "-d" << String(getIntOption_("d")).toQString(); - if (getStringOption_("P") != "random") arguments << "-P" << getStringOption_("P").toQString(); - if (getFlag_("T")) arguments << "-T"; - if (getFlag_("Y")) arguments << "-Y"; - if (getFlag_("H")) arguments << "-H"; - if (getFlag_("fido-truncation")) arguments << "--fido-truncation"; - if (getFlag_("Q")) arguments << "-Q"; - arguments << "-U"; - arguments << pin_file.toQString(); + { + arguments << "-r" << pout_file.toQString(); + if (getFlag_("e")) arguments << "-e"; + if (getFlag_("Z")) arguments << "-Z"; + if (getDoubleOption_("p") != 0.0) arguments << "-p" << String(getDoubleOption_("p")).toQString(); + if (getDoubleOption_("n") != 0.0) arguments << "-n" << String(getDoubleOption_("n")).toQString(); + if (getDoubleOption_("F") != 0.01) arguments << "-F" << String(getDoubleOption_("F")).toQString(); + if (getDoubleOption_("t") != 0.01) arguments << "-t" << 
String(getDoubleOption_("t")).toQString(); + if (getIntOption_("i") != 0) arguments << "-i" << String(getIntOption_("i")).toQString(); + if (getFlag_("x")) arguments << "-x"; + if (getDoubleOption_("f") != 0.6) arguments << "-f" << String(getDoubleOption_("f")).toQString(); + if (getStringOption_("J") != "") arguments << "-J" << getStringOption_("J").toQString(); + if (getStringOption_("k") != "") arguments << "-k" << getStringOption_("k").toQString(); + if (getStringOption_("w") != "") arguments << "-w" << getStringOption_("w").toQString(); + if (getStringOption_("W") != "") arguments << "-W" << getStringOption_("W").toQString(); + if (getStringOption_("V") != "") arguments << "-V" << getStringOption_("V").toQString(); + if (getIntOption_("v") != 2) arguments << "-v" << String(getIntOption_("v")).toQString(); + if (getFlag_("u")) arguments << "-u"; + if (getFlag_("R")) arguments << "-R"; + if (getFlag_("O")) arguments << "-O"; + if (getIntOption_("S") != 1) arguments << "-S" << String(getDoubleOption_("S")).toQString(); + if (getFlag_("K")) arguments << "-K"; + if (getFlag_("D")) arguments << "-D"; + if (getStringOption_("B") != "") arguments << "-B" << getStringOption_("B").toQString(); + if (getFlag_("U")) arguments << "-U"; + if (getFlag_("s")) arguments << "-s"; + if (getFlag_("A")) arguments << "-A"; + if (getDoubleOption_("a") != 0.0) arguments << "-a" << String(getDoubleOption_("a")).toQString(); + if (getDoubleOption_("b") != 0.0) arguments << "-b" << String(getDoubleOption_("b")).toQString(); + if (getDoubleOption_("G") != 0.0) arguments << "-G" << String(getDoubleOption_("G")).toQString(); + if (getFlag_("g")) arguments << "-g"; + if (getFlag_("I")) arguments << "-I"; + if (getFlag_("q")) arguments << "-q"; + if (getFlag_("N")) arguments << "-N"; + if (getFlag_("E")) arguments << "-E"; + if (getFlag_("C")) arguments << "-C"; + if (getIntOption_("d") != 0) arguments << "-d" << String(getIntOption_("d")).toQString(); + if (getStringOption_("P") != 
"random") arguments << "-P" << getStringOption_("P").toQString(); + if (getFlag_("T")) arguments << "-T"; + if (getFlag_("Y")) arguments << "-Y"; + if (getFlag_("H")) arguments << "-H"; + if (getFlag_("fido-truncation")) arguments << "--fido-truncation"; + if (getFlag_("Q")) arguments << "-Q"; + arguments << "-U"; + arguments << pin_file.toQString(); + } + writeLog_("Prepared percolator input."); + //------------------------------------------------------------- + // run percolator + //------------------------------------------------------------- // Percolator execution with the executable ant the arguments StringList int status = QProcess::execute(percolator_executable.toQString(), arguments); // does automatic escaping etc... if (status != 0) @@ -392,31 +434,27 @@ class TOPPPercolator : } return EXTERNAL_PROGRAM_ERROR; } + writeLog_("Executed percolator!"); + + //------------------------------------------------------------- + // reintegrate pout results + //------------------------------------------------------------- // when percolator finished calculation, it stores the results -r option (with or without -U) or -m (which seems to be not working) CsvFile csv_file(pout_file, '\t'); - map > pep_map; + map > pep_map; StringList row; for (size_t i = 1; i < csv_file.rowCount(); ++i) { csv_file.getRow(i, row); - vector row_values; - // peptide - row_values.push_back(row[4].chop(2).reverse().chop(2).reverse()); -// writeDebug_("sequence: " + row[4].chop(2).reverse().chop(2).reverse(), 99); - // SVM-score - row_values.push_back(row[1]); - // q-Value - row_values.push_back(row[2]); - // PEP - row_values.push_back(row[3]); - - vector substr; - row[0].split('_', substr); -// writeDebug_("Mapping input to key: " + substr[2] , 2); - pep_map[substr[2]] = row_values; // scannr. 
as written in preparePIN + TopPerc::PercolatorResult res(row); + if (pep_map.find(res.PSMId) == pep_map.end()) + { + pep_map[res.PSMId] = vector(); + } + pep_map[res.PSMId].push_back(res); } // As the percolator output file is not needed anymore, the temporary directory is going to be deleted @@ -436,56 +474,57 @@ class TOPPPercolator : String sid = it->getMetaValue("spectrum_reference"); if (pep_map.find(sid) == pep_map.end()) { - //writeDebug_("No suitable PeptideIdentification entry 1st found for " + sid , 2); + LOG_DEBUG << "No suitable PeptideIdentification entry found for .pout entry " << sid; vector sr; sid.split('=', sr); sid = sr.back(); + LOG_DEBUG << " - retry with " << sid << endl; if (pep_map.find(sid) == pep_map.end()) { - //writeDebug_("No suitable PeptideIdentification entry 2nd found for " + sid + " - emulate percolator scores with exisiting scores?", 111); ++c_debug; - writeDebug_("No suitable PeptideIdentification entry for " + sid, 3); + LOG_DEBUG << "Also none found" << endl; continue; } } - it->setScoreType("q-value"); - it->setHigherScoreBetter(false); - AASequence aat; -// writeDebug_("sequence: " + pep_map[sid][0], 99); - aat = AASequence::fromString(pep_map[sid][0]); -// writeDebug_("sequence: " + aat.toString(), 99); - + //check each PeptideHit for compliance with one of the PercolatorResults (by sequence) for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - if (hit->getSequence() == aat) + String pis = hit->getSequence().toUnmodifiedString(); + for (vector::iterator pr = pep_map.find(sid)->second.begin(); pr != pep_map.find(sid)->second.end(); ++pr) { - //get aa before/after/charge and metainfo - hit->setMetaValue("MS:1001492", pep_map[sid][1].toDouble()); //svm score - double qv = pep_map[sid][2].toDouble(); // q-value - hit->setMetaValue("MS:1001491", qv); - hit->setMetaValue("prepercolatorscore", hit->getScore()); - writeDebug_("found peptide and wrote percolator scoring from 
"+String(hit->getScore())+" to "+String(qv), 99); - hit->setScore(qv); - hit->setMetaValue("MS:1001493", pep_map[sid][3].toDouble()); //pep - //writeDebug_("found peptide and wrote percolator scoring", 99); + if (pis == pr->peptide && + pr->preAA == hit->getPeptideEvidences().front().getAABefore() && + pr->postAA == hit->getPeptideEvidences().front().getAAAfter()) + { + hit->setMetaValue("MS:1001492", pr->score); // svm score + hit->setMetaValue("MS:1001491", pr->qvalue); // percolator q value + hit->setMetaValue("MS:1001493", pr->posterior_error_prob); // percolator pep + } } - else writeDebug_(aat.toString()+" - found nothing and wrote no percolator scoring", 99); } } - writeDebug_("No suitable PeptideIdentification for " + String(c_debug) + " out of " + String(peptide_ids.size()), 2); + + LOG_DEBUG << "No suitable PeptideIdentification for " << c_debug << " out of " << peptide_ids.size() << endl; for (vector::iterator it = protein_ids.begin(); it != protein_ids.end(); ++it) { - it->setSearchEngine("Percolator"); + //will not be set because ALL decoy hits got no new score + //it->setSearchEngine("Percolator"); + //it->setScoreType("q-value"); + //it->setHigherScoreBetter(false); + //TODO add software percolator and topperc + it->setMetaValue("percolator", "TopPerc"); + ProteinIdentification::SearchParameters sp = it->getSearchParameters(); + //TODO write all percolator parameters as set here in sp + it->setSearchParameters(sp); } // Storing the PeptideHits with calculated q-value, pep and svm score MzIdentMLFile().store(getStringOption_("out").toQString().toStdString(), protein_ids, peptide_ids); - LOG_INFO << "TopPerc finished successfully!" 
<< endl; - + writeLog_("TopPerc finished successfully!"); return EXECUTION_OK; } From 1c5bfc4b3f35999b3bcb61ae8868e851fea41e30 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sun, 29 May 2016 22:28:01 +0200 Subject: [PATCH 16/41] [FIX] multi engine percolating like in my pyOpenMS script, but little better (prep pin missing tho) --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 20 +++ src/openms/source/ANALYSIS/ID/TopPerc.cpp | 139 +++++++++++++++ src/utils/TopPerc.cpp | 161 +++++++++++------- 3 files changed, 255 insertions(+), 65 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index cba007bc969..16e4fbfb5f7 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -118,10 +118,30 @@ namespace OpenMS static double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons); static String getScanIdentifier(std::vector::iterator it, std::vector::iterator start); static void assignDeltaScore(std::vector& hits, String score_ref); + static void mergeMULTIids(std::vector >& protein_ids_list, std::vector >& peptide_ids_list); + + struct lq_ProteinHit + { + inline bool operator() (const ProteinHit& h1, const ProteinHit& h2) + { + return (h1.getAccession() < h2.getAccession()); + } + }; + + struct lq_PeptideEvidence + { + inline bool operator() (const PeptideEvidence& h1, const PeptideEvidence& h2) + { + return (h1.getProteinAccession() < h2.getProteinAccession()); + } + }; private: TopPerc(); virtual ~TopPerc(); + + + }; } //namespace OpenMS diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index e43042727fc..8684b703c7c 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -881,5 +881,144 @@ feature abbreviation feature description } } + void TopPerc::mergeMULTIids(vector >& protein_ids_list, vector >& peptide_ids_list) + { + //both 
input parameters must correspond + if (peptide_ids_list.size() != protein_ids_list.size()) + { + throw Exception::ElementNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Protein and Peptide Identification mismatch"); + } + //search parameters of all runs must correspond (considering front() of each only) + for (size_t i=1; i < protein_ids_list.size(); ++i) + { + if( protein_ids_list[i-1].front().getSearchParameters().db != protein_ids_list[i].front().getSearchParameters().db ) + { + throw Exception::ElementNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, protein_ids_list[i-1].front().getSearchParameters().db+"!="+protein_ids_list[i].front().getSearchParameters().db); + } + } + + //setup map of merge characteristics per spectrum + std::map unified; + + string common = "q-value_score, expect_score"; + StringList commonMetaValues = ListUtils::create(common); + for (vector >::iterator pilit = peptide_ids_list.begin(); pilit != peptide_ids_list.end(); ++pilit) + { + String SE = protein_ids_list[distance(peptide_ids_list.begin(), pilit)].front().getSearchEngine(); + for (vector::iterator pit = pilit->begin(); pit != pilit->end(); ++pit) + { + PeptideIdentification ins = *pit; + //prepare for merge + for (vector::iterator hit = ins.getHits().begin(); hit != ins.getHits().end(); ++hit) + { + //move score from each hit to meta value + hit->setMetaValue(SE + ":" + ins.getScoreType(), hit->getScore()); + //set score in each hit to #SE hits + hit->setScore(1); + //rename common meta values (to SE:commonmetavaluename) + for (size_t i = 0; i < commonMetaValues.size(); ++i) + { + if (hit->metaValueExists(commonMetaValues[i])) + { + DataValue val = hit->getMetaValue(commonMetaValues[i]); + hit->setMetaValue(SE+":"+commonMetaValues[i],val); + hit->removeMetaValue(commonMetaValues[i]); + } + } + } + ins.setScoreType("multiple"); + String spectrum_reference = ins.getMetaValue("spectrum_reference"); + //merge in unified map + if (unified.find(spectrum_reference) == unified.end()) + 
{ + unified[spectrum_reference] = ins; + } + else + { + //find corresponding hit + for (vector::iterator hit = ins.getHits().begin(); hit != ins.getHits().end(); ++hit) + { + for (vector::iterator merger = unified[spectrum_reference].getHits().begin(); merger != unified[spectrum_reference].getHits().end(); ++merger) + { + if (hit->getSequence()==merger->getSequence()) + { + //care for peptide evidences!! set would be okay if checked for same search db in parameters, +// vector pev; +// pev.reserve(max(hit->getPeptideEvidences().size(),merger->getPeptideEvidences().size())); +// std::vector::iterator uni; +// std::sort(merger->getPeptideEvidences().begin(),merger->getPeptideEvidences().end(), TopPerc::lq_PeptideEvidence); +// std::sort(hit->getPeptideEvidences().begin(),hit->getPeptideEvidences().end(), TopPerc::lq_PeptideEvidence); +// uni = std::set_union(swop.front().getHits().begin(), swop.front().getHits().end(), +// it->front().getHits().begin(),it->front().getHits().end(), pev.begin(), +// TopPerc::lq_PeptideEvidence); +// pev.resize(uni-pev.begin()); +// merger->setPeptideEvidences(pev); + //There is no mutable getPeptideEvidences() accessor in PeptideHit - above will not werk, but so long: + //Implying PeptideIndexer was applied (with the same search db each) will care for that all PeptideEvidences from two hits with equal AASequence are the same + + //merge meta values + vector< String > keys; + hit->getKeys(keys); + for (vector::const_iterator kt = keys.begin(); kt != keys.end(); ++kt) + { + if (!merger->metaValueExists(*kt)) + { + merger->setMetaValue(*kt, hit->getMetaValue(*kt)); + } + } + merger->setScore(merger->getScore() + hit->getScore()); + break; + } + } + } + } + } + } + vector swip; + swip.reserve(unified.size()); + for (std::map::iterator it = unified.begin(); it != unified.end(); ++it) + { + swip.push_back(it->second); + } + peptide_ids_list.front().swap(swip); + peptide_ids_list.resize(1); + + //care for search parameters!! 
+ vector swop; + swop.push_back(ProteinIdentification()); + for (vector >::iterator it = protein_ids_list.begin(); it != protein_ids_list.end(); ++it) + { + std::vector v; + v.reserve(max(swop.front().getHits().size(), it->front().getHits().size())); + std::vector::iterator uni; + std::sort(it->front().getHits().begin(),it->front().getHits().end(), TopPerc::lq_ProteinHit()); + uni = std::set_union(swop.front().getHits().begin(), swop.front().getHits().end(), + it->front().getHits().begin(),it->front().getHits().end(), v.begin(), + TopPerc::lq_ProteinHit()); + v.resize(uni-v.begin()); + swap(swop.front().getHits(),v); + ProteinIdentification::SearchParameters sp = it->front().getSearchParameters(); + String SE = it->front().getSearchEngine(); + {//insert into MetaInfo as SE:param + swop.front().setMetaValue(SE+":db",sp.db); + swop.front().setMetaValue(SE+":db_version",sp.db_version); + swop.front().setMetaValue(SE+":taxonomy",sp.taxonomy); + swop.front().setMetaValue(SE+":charges",sp.charges); + swop.front().setMetaValue(SE+":fixed_modifications",ListUtils::concatenate(sp.fixed_modifications, ",")); + swop.front().setMetaValue(SE+":variable_modifications",ListUtils::concatenate(sp.variable_modifications, ",")); + swop.front().setMetaValue(SE+":missed_cleavages",sp.missed_cleavages); + swop.front().setMetaValue(SE+":fragment_mass_tolerance",sp.fragment_mass_tolerance); + swop.front().setMetaValue(SE+":fragment_mass_tolerance_ppm",sp.fragment_mass_tolerance_ppm); + swop.front().setMetaValue(SE+":precursor_tolerance",sp.precursor_tolerance); + swop.front().setMetaValue(SE+":precursor_mass_tolerance_ppm",sp.precursor_mass_tolerance_ppm); + swop.front().setMetaValue(SE+":digestion_enzyme",sp.digestion_enzyme.getName()); + } + swop.front().setPrimaryMSRunPath(it->front().getPrimaryMSRunPath()); + swop.front().setSearchEngine("multiple"); + } + protein_ids_list.front().swap(swop); + protein_ids_list.resize(1); + + } } diff --git a/src/utils/TopPerc.cpp 
b/src/utils/TopPerc.cpp index e6806db08ec..33536ece40a 100644 --- a/src/utils/TopPerc.cpp +++ b/src/utils/TopPerc.cpp @@ -31,7 +31,7 @@ // $Maintainer: Mathias Walzer $ // $Authors: Andreas Simon, Mathias Walzer $ // -------------------------------------------------------------------------- - +#include #include #include @@ -108,8 +108,8 @@ class TOPPPercolator : protected: void registerOptionsAndFlags_() { - registerInputFile_("in", "", "", "Input target file", true); - setValidFormats_("in", ListUtils::create("mzid")); + registerInputFileList_("in", "", StringList(), "Input file(s)", true); + setValidFormats_("in", ListUtils::create("mzid,idXML")); registerInputFile_("in_decoy", "", "", "Input decoy file", false); setValidFormats_("in_decoy", ListUtils::create("mzid")); registerOutputFile_("out", "", "", "Output file", true); @@ -184,9 +184,9 @@ class TOPPPercolator : //------------------------------------------------------------- // parsing parameters //------------------------------------------------------------- - const String in = getStringOption_("in"); + const StringList in_list = getStringList_("in"); const String in_decoy = getStringOption_("in_decoy"); - writeDebug_(String("Input file of target: ") + in + " " + in_decoy, 2); + LOG_DEBUG << "Input file (of target?): " << ListUtils::concatenate(in_list, ",") << " & " << in_decoy << " (decoy)" << endl; const String percolator_executable(getStringOption_("percolator_executable")); writeDebug_(String("Path to the percolator: ") + percolator_executable, 2); @@ -200,55 +200,64 @@ class TOPPPercolator : //------------------------------------------------------------- // read input //------------------------------------------------------------- - FileHandler fh; - FileTypes::Type in_type = fh.getType(in); - if (in_type == FileTypes::IDXML) + vector > peptide_ids_list; + vector > protein_ids_list; + for (size_t i = 0; i < in_list.size(); ++i) { - IdXMLFile().load(in, protein_ids, peptide_ids); - } - else if (in_type 
== FileTypes::MZIDENTML) - { - LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; - MzIdentMLFile().load(in, protein_ids, peptide_ids); - } - //else catched by TOPPBase:registerInput being mandatory mzid or idxml + String in = in_list[i]; + FileHandler fh; + FileTypes::Type in_type = fh.getType(in); + if (in_type == FileTypes::IDXML) + { + IdXMLFile().load(in, protein_ids, peptide_ids); + } + else if (in_type == FileTypes::MZIDENTML) + { + LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; + MzIdentMLFile().load(in, protein_ids, peptide_ids); + } + //else catched by TOPPBase:registerInput being mandatory mzid or idxml - if (peptide_ids.empty()) - { - writeLog_("No or empty input file specified. Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } + if (peptide_ids.empty()) + { + writeLog_("No or empty input file specified. Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } - //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process - for (std::vector::iterator pit = peptide_ids.begin(); pit != peptide_ids.end(); ++pit) - { - for (vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) + //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process + for (std::vector::iterator pit = peptide_ids.begin(); pit != peptide_ids.end(); ++pit) { - // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! - if (!pht->metaValueExists("target_decoy")) + for (vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) { - if (!in_decoy.empty()) + // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! 
+ if (!pht->metaValueExists("target_decoy")) { - pht->setMetaValue("target_decoy", "target"); - } - else - { - writeLog_("No target decoy search results discrimination possible. Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; + if (!in_decoy.empty()) + { + pht->setMetaValue("target_decoy", "target"); + } + else + { + writeLog_("No target decoy search results discrimination possible. Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } } } } + peptide_ids_list.push_back(peptide_ids); + protein_ids_list.push_back(protein_ids); } //------------------------------------------------------------- // read more input if necessary //------------------------------------------------------------- - if (!in_decoy.empty()) + if (!in_decoy.empty() && in_list.size() == 1) { vector decoy_peptide_ids; vector decoy_protein_ids; + FileHandler fh; FileTypes::Type in_decoy_type = fh.getType(in_decoy); if (in_decoy_type == FileTypes::IDXML) { @@ -260,41 +269,41 @@ class TOPPPercolator : MzIdentMLFile().load(in_decoy, decoy_protein_ids, decoy_peptide_ids); } - //paranoia check if this comes from the same search engine! + //paranoia check if this comes from the same search engine! 
(only in the first proteinidentification of the first proteinidentifications vector vector) { - if (decoy_protein_ids.front().getSearchEngine() != protein_ids.front().getSearchEngine() ) + if (decoy_protein_ids.front().getSearchEngine() != protein_ids_list.front().front().getSearchEngine() ) { LOG_WARN << "Warning about differing SearchEngine between target and decoy run" << endl; } - if (decoy_protein_ids.front().getScoreType() != protein_ids.front().getScoreType() ) + if (decoy_protein_ids.front().getScoreType() != protein_ids_list.front().front().getScoreType() ) { LOG_WARN << "Warning about differing SoreType between target and decoy run" << endl; } - if (decoy_protein_ids.front().getPrimaryMSRunPath() != protein_ids.front().getPrimaryMSRunPath() ) + if (decoy_protein_ids.front().getPrimaryMSRunPath() != protein_ids_list.front().front().getPrimaryMSRunPath() ) { LOG_WARN << "Warning about differing SearchInput between target and decoy run" << endl; } - if (decoy_protein_ids.front().getSearchParameters().digestion_enzyme != protein_ids.front().getSearchParameters().digestion_enzyme ) + if (decoy_protein_ids.front().getSearchParameters().digestion_enzyme != protein_ids_list.front().front().getSearchParameters().digestion_enzyme ) { LOG_WARN << "Warning about differing DigestionEnzyme between target and decoy run" << endl; } - if (decoy_protein_ids.front().getSearchParameters().variable_modifications != protein_ids.front().getSearchParameters().variable_modifications ) + if (decoy_protein_ids.front().getSearchParameters().variable_modifications != protein_ids_list.front().front().getSearchParameters().variable_modifications ) { LOG_WARN << "Warning about differing VarMods between target and decoy run" << endl; } - if (decoy_protein_ids.front().getSearchParameters().fixed_modifications != protein_ids.front().getSearchParameters().fixed_modifications ) + if (decoy_protein_ids.front().getSearchParameters().fixed_modifications != 
protein_ids_list.front().front().getSearchParameters().fixed_modifications ) { LOG_WARN << "Warning about differing FixMods between target and decoy run" << endl; } - if (decoy_protein_ids.front().getSearchParameters().charges != protein_ids.front().getSearchParameters().charges ) + if (decoy_protein_ids.front().getSearchParameters().charges != protein_ids_list.front().front().getSearchParameters().charges ) { LOG_WARN << "Warning about differing SearchCharges between target and decoy run" << endl; } - if (decoy_protein_ids.front().getSearchParameters().fragment_mass_tolerance != protein_ids.front().getSearchParameters().fragment_mass_tolerance ) + if (decoy_protein_ids.front().getSearchParameters().fragment_mass_tolerance != protein_ids_list.front().front().getSearchParameters().fragment_mass_tolerance ) { LOG_WARN << "Warning about differing FragTol between target and decoy run" << endl; } - if (decoy_protein_ids.front().getSearchParameters().precursor_tolerance != protein_ids.front().getSearchParameters().precursor_tolerance ) + if (decoy_protein_ids.front().getSearchParameters().precursor_tolerance != protein_ids_list.front().front().getSearchParameters().precursor_tolerance ) { LOG_WARN << "Warning about differing PrecTol between target and decoy run" << endl; } @@ -310,8 +319,8 @@ class TOPPPercolator : } } //TODO check overlap of ids in terms of spectrum id/reference - peptide_ids.insert( peptide_ids.end(), decoy_peptide_ids.begin(), decoy_peptide_ids.end() ); - protein_ids.insert( protein_ids.end(), decoy_protein_ids.begin(), decoy_protein_ids.end() ); + peptide_ids_list.front().insert( peptide_ids.end(), decoy_peptide_ids.begin(), decoy_peptide_ids.end() ); + protein_ids_list.front().insert( protein_ids.end(), decoy_protein_ids.begin(), decoy_protein_ids.end() ); writeLog_("Using decoy hits from separate file."); } else @@ -323,24 +332,35 @@ class TOPPPercolator : //------------------------------------------------------------- // extract search engine and 
prepare pin //------------------------------------------------------------- - String se = protein_ids.front().getSearchEngine(); + String se = protein_ids_list.front().front().getSearchEngine(); + for (vector >::iterator pilit = protein_ids_list.begin(); pilit != protein_ids_list.end(); ++pilit) + { + if (se != protein_ids_list.front().front().getSearchEngine()) + { + se = "multiple"; + break; + } + } LOG_DEBUG << "Registered search engine: " << se << endl; TextFile txt; //TODO introduce min/max charge to parameters for now take available range int max_charge = 0; int min_charge = 10; - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + for (vector >::iterator pilit = peptide_ids_list.begin(); pilit != peptide_ids_list.end(); ++pilit) { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + for (vector::iterator it = pilit->begin(); it != pilit->end(); ++it) { - if (hit->getCharge() > max_charge) - { - max_charge = hit->getCharge(); - } - if (hit->getCharge() < min_charge) + for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - min_charge = hit->getCharge(); + if (hit->getCharge() > max_charge) + { + max_charge = hit->getCharge(); + } + if (hit->getCharge() < min_charge) + { + min_charge = hit->getCharge(); + } } } } @@ -348,10 +368,21 @@ class TOPPPercolator : string enz_str = getStringOption_("enzyme"); + //ignore all but first input if NOT multiple for now + if (se == "multiple") + { + TopPerc::mergeMULTIids(protein_ids_list,peptide_ids_list); // will collapse the list (reference) + //TopPerc::prepareMULTIpin(peptide_ids_list.front(), enz_str, txt, min_charge, max_charge); + } //TODO introduce custom feature selection from TopPerc::prepareCUSTOMpin to parameters - if (se == "MS-GF+") TopPerc::prepareMSGFpin(peptide_ids, enz_str, txt, min_charge, max_charge, getFlag_("MHC")); - if (se == "Mascot") TopPerc::prepareMASCOTpin(peptide_ids, enz_str, txt, 
min_charge, max_charge); - if (se == "XTandem") TopPerc::prepareXTANDEMpin(peptide_ids, enz_str, txt, min_charge, max_charge); + else if (se == "MS-GF+") TopPerc::prepareMSGFpin(peptide_ids_list.front(), enz_str, txt, min_charge, max_charge, getFlag_("MHC")); + else if (se == "Mascot") TopPerc::prepareMASCOTpin(peptide_ids_list.front(), enz_str, txt, min_charge, max_charge); + else if (se == "XTandem") TopPerc::prepareXTANDEMpin(peptide_ids_list.front(), enz_str, txt, min_charge, max_charge); + else + { + writeLog_("No known input to create percolator features from. Aborting"); + return INCOMPATIBLE_INPUT_DATA; + } // create temp directory to store percolator in file pin.tab temporarily String temp_directory_body = QDir::toNativeSeparators((File::getTempDirectory() + "/" + File::getUniqueName() + "/").toQString()); // body for the tmp files @@ -469,7 +500,7 @@ class TOPPPercolator : // Add the percolator results to the peptide vector of the original input file size_t c_debug = 0; - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + for (vector::iterator it = peptide_ids_list.front().begin(); it != peptide_ids_list.front().end(); ++it) { String sid = it->getMetaValue("spectrum_reference"); if (pep_map.find(sid) == pep_map.end()) @@ -505,9 +536,9 @@ class TOPPPercolator : } } - LOG_DEBUG << "No suitable PeptideIdentification for " << c_debug << " out of " << peptide_ids.size() << endl; + LOG_DEBUG << "No suitable PeptideIdentification for " << c_debug << " out of " << peptide_ids_list.front().size() << endl; - for (vector::iterator it = protein_ids.begin(); it != protein_ids.end(); ++it) + for (vector::iterator it = protein_ids_list.front().begin(); it != protein_ids_list.front().end(); ++it) { //will not be set because ALL decoy hits got no new score //it->setSearchEngine("Percolator"); @@ -522,7 +553,7 @@ class TOPPPercolator : } // Storing the PeptideHits with calculated q-value, pep and svm score - 
MzIdentMLFile().store(getStringOption_("out").toQString().toStdString(), protein_ids, peptide_ids); + MzIdentMLFile().store(getStringOption_("out").toQString().toStdString(), protein_ids_list.front(), peptide_ids_list.front()); writeLog_("TopPerc finished successfully!"); return EXECUTION_OK; From 4dbd06743e1151f31675acab9c2fbefd1256c1ed Mon Sep 17 00:00:00 2001 From: mwalzer Date: Mon, 30 May 2016 16:43:30 +0200 Subject: [PATCH 17/41] [FIX] minimal set multifeature for prepareMULTIpin --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 1 + src/openms/source/ANALYSIS/ID/TopPerc.cpp | 154 ++++++++++++++++++ src/utils/TopPerc.cpp | 2 +- 3 files changed, 156 insertions(+), 1 deletion(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 16e4fbfb5f7..30e47cc5160 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -114,6 +114,7 @@ namespace OpenMS static void prepareXTANDEMpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); static void prepareCOMETpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); static void prepareMASCOTpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); + static void prepareMULTIpin(std::vector& peptide_ids, ProteinIdentification& protein_id, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); static size_t countEnzymatic(String peptide, std::string& enz); static double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons); static String getScanIdentifier(std::vector::iterator it, std::vector::iterator start); diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 8684b703c7c..68b0e39ddd6 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp 
+++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -1000,6 +1000,7 @@ feature abbreviation feature description ProteinIdentification::SearchParameters sp = it->front().getSearchParameters(); String SE = it->front().getSearchEngine(); {//insert into MetaInfo as SE:param + swop.front().setMetaValue("SE:"+SE,it->front().getSearchEngineVersion()); swop.front().setMetaValue(SE+":db",sp.db); swop.front().setMetaValue(SE+":db_version",sp.db_version); swop.front().setMetaValue(SE+":taxonomy",sp.taxonomy); @@ -1021,4 +1022,157 @@ feature abbreviation feature description } + void TopPerc::prepareMULTIpin(vector& peptide_ids, ProteinIdentification& protein_id, string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep) + { + //------------------------------------------------------------- + // header + //------------------------------------------------------------- + // Create String of the charges for the header of the tab file + stringstream ss; + ss << "Charge" << min_charge << ", "; + for (int j = min_charge+1; j <= max_charge; j++) + { + ss << "Charge" << j << ","; + } + + StringList ses_used; + StringList se_specifics; + StringList keys; + protein_id.getKeys(keys); + + if (ListUtils::contains(keys, "MS-GF+")) + { + ses_used.push_back("MS-GF+"); + se_specifics.push_back("MS:1002049"); // rawscore + se_specifics.push_back("MS:1002053"); // evalue + } + if (ListUtils::contains(keys, "Mascot")) + { + ses_used.push_back("Mascot"); + se_specifics.push_back("Mascot_score"); + se_specifics.push_back("EValue"); + } + if (ListUtils::contains(keys, "Comet")) + { + ses_used.push_back("Comet"); + se_specifics.push_back("MS:1002252"); //xcorr + se_specifics.push_back("MS:1002257"); //evalue + } + if (ListUtils::contains(keys, "XTandem")) + { + ses_used.push_back("XTandem"); + se_specifics.push_back("XTandem_score"); + se_specifics.push_back("E-Value"); + } + + LOG_INFO << "Using " << ListUtils::concatenate(ses_used, ", ") << " as source for search engine specific 
features." << endl; + + String featureset = "id,label,ScanNr," + + ListUtils::concatenate(se_specifics, ",") + + ss.str() + + "ionfrac,mass,enzN,enzC,enzInt,numHits,dM,absdM,PepLen,peptide,proteinId1"; + StringList txt_header = ListUtils::create(featureset); + // Insert the header with the features names to the file + txt.addLine(ListUtils::concatenate(txt_header, out_sep)); + + //------------------------------------------------------------- + // values + //------------------------------------------------------------- + // get all the feature values + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + it->sort(); + it->assignRanks(); + String scannumber = getScanIdentifier(it, peptide_ids.begin()); + std::vector hits = it->getHits(); + for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) + { + StringList idents; + idents.push_back(it->getBaseName()); + idents.push_back(scannumber); + idents.push_back(String(jt->getRank())); + String sid = ListUtils::concatenate(idents, "_"); + int charge = jt->getCharge(); + int label = 1; + if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) + { + label = -1; + } + + StringList sesp; + for (StringList::iterator s = se_specifics.begin(); s != se_specifics.end(); ++s) + { + sesp.push_back(String(jt->getMetaValue(*s))); + } + + StringList chargen; + // write 1 for the correct charge, 0 for other charges + for (int i = min_charge; i <= max_charge; ++i) + { + if (charge != i) + { + chargen.push_back("0"); + } + else + { + chargen.push_back("1"); + } + } + + //IonFrac + String ionfrac = String(double(jt->getMetaValue("matched_intensity"))/double(jt->getMetaValue("sum_intensity"))); // also consider "matched_ion_number"/"peak_number" + //Mass + double mass = jt->getSequence().getMonoWeight(Residue::Full, charge)/charge; + //enzN + bool enzN = isEnz(jt->getPeptideEvidences().front().getAABefore(), 
jt->getSequence().getPrefix(1).toString().c_str()[0], enz); + //enzC + bool enzC = isEnz(jt->getSequence().getSuffix(1).toString().c_str()[0], jt->getPeptideEvidences().front().getAAAfter(), enz); + //enzInt + int enzInt = countEnzymatic(jt->getSequence().toUnmodifiedString(), enz); + //numHits + int numHits = jt->getScore(); + //dM + double dm = it->getMZ() - mass; + //absdM + double absdm = abs(dm); + //PepLen + int peplen = jt->getSequence().size(); + //peptide + String sequence = ""; + //replace flanking aa if [ or ] with - + char pb = jt->getPeptideEvidences().front().getAABefore(); + sequence += pb=='['?"-.":String(pb); // just first peptide evidence + sequence += jt->getSequence().toString(); + char pa = jt->getPeptideEvidences().front().getAAAfter(); + sequence += pa==']'?".-":String(pa); // just first peptide evidence + //proteinId1 + StringList pepevid; + for (vector::const_iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) + { + pepevid.push_back(kt->getProteinAccession()); + } + + StringList row; + row.push_back(sid); + row.push_back(label); + row.push_back(scannumber); + row.push_back(ListUtils::concatenate(sesp, out_sep)); + row.push_back(ListUtils::concatenate(chargen, out_sep)); + row.push_back(ionfrac); + row.push_back(String(mass)); + row.push_back(String(enzN)); + row.push_back(String(enzC)); + row.push_back(String(enzInt)); + row.push_back(String(numHits)); + row.push_back(String(dm)); + row.push_back(String(absdm)); + row.push_back(String(peplen)); + row.push_back(sequence); + row.push_back(ListUtils::concatenate(pepevid, out_sep)); + + txt.addLine(ListUtils::concatenate(row, out_sep)); + } + } + } + } diff --git a/src/utils/TopPerc.cpp b/src/utils/TopPerc.cpp index 33536ece40a..5d1d1ee7f52 100644 --- a/src/utils/TopPerc.cpp +++ b/src/utils/TopPerc.cpp @@ -372,7 +372,7 @@ class TOPPPercolator : if (se == "multiple") { TopPerc::mergeMULTIids(protein_ids_list,peptide_ids_list); // will collapse the list 
(reference) - //TopPerc::prepareMULTIpin(peptide_ids_list.front(), enz_str, txt, min_charge, max_charge); + TopPerc::prepareMULTIpin(peptide_ids_list.front(), protein_ids_list.front().front(), enz_str, txt, min_charge, max_charge); } //TODO introduce custom feature selection from TopPerc::prepareCUSTOMpin to parameters else if (se == "MS-GF+") TopPerc::prepareMSGFpin(peptide_ids_list.front(), enz_str, txt, min_charge, max_charge, getFlag_("MHC")); From 5ac059307aa7e616013ef014ccc8bbb9e1a74d07 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Mon, 30 May 2016 21:11:29 +0200 Subject: [PATCH 18/41] [FIX] removed mapping bugs, improved on logging informative quality --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 2 +- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 90 +++++++++++++------ src/utils/TopPerc.cpp | 29 +++--- 3 files changed, 81 insertions(+), 40 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 30e47cc5160..7188c720ee6 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -119,7 +119,7 @@ namespace OpenMS static double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons); static String getScanIdentifier(std::vector::iterator it, std::vector::iterator start); static void assignDeltaScore(std::vector& hits, String score_ref); - static void mergeMULTIids(std::vector >& protein_ids_list, std::vector >& peptide_ids_list); + static void mergeMULTIids(std::vector >& protein_ids_list, std::vector >& peptide_ids_list, bool skip_checks=false); struct lq_ProteinHit { diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 68b0e39ddd6..e4b1b0b7736 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -861,7 +861,7 @@ feature abbreviation feature description LOG_WARN << "no known spectrum identifiers, using index [1,n] - 
use at own risk." << endl; } } - return scannumber; + return scannumber.removeWhitespaces(); } void TopPerc::assignDeltaScore(vector& hits, String score_ref) @@ -881,28 +881,34 @@ feature abbreviation feature description } } - void TopPerc::mergeMULTIids(vector >& protein_ids_list, vector >& peptide_ids_list) + void TopPerc::mergeMULTIids(vector >& protein_ids_list, vector >& peptide_ids_list, bool skip_checks) { //both input parameters must correspond if (peptide_ids_list.size() != protein_ids_list.size()) { throw Exception::ElementNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Protein and Peptide Identification mismatch"); } + //search parameters of all runs must correspond (considering front() of each only) - for (size_t i=1; i < protein_ids_list.size(); ++i) + if (!skip_checks) { - if( protein_ids_list[i-1].front().getSearchParameters().db != protein_ids_list[i].front().getSearchParameters().db ) + for (size_t i=1; i < protein_ids_list.size(); ++i) { - throw Exception::ElementNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, protein_ids_list[i-1].front().getSearchParameters().db+"!="+protein_ids_list[i].front().getSearchParameters().db); + if( protein_ids_list[i-1].front().getSearchParameters().db != protein_ids_list[i].front().getSearchParameters().db ) + { + throw Exception::ElementNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, protein_ids_list[i-1].front().getSearchParameters().db+"!="+protein_ids_list[i].front().getSearchParameters().db); + } } } + LOG_DEBUG << "creating spectrum map" << endl; + //setup map of merge characteristics per spectrum std::map unified; - string common = "q-value_score, expect_score"; + std::string common = "q-value_score, expect_score"; StringList commonMetaValues = ListUtils::create(common); - for (vector >::iterator pilit = peptide_ids_list.begin(); pilit != peptide_ids_list.end(); ++pilit) + for (std::vector >::iterator pilit = peptide_ids_list.begin(); pilit != peptide_ids_list.end(); ++pilit) { String SE = 
protein_ids_list[distance(peptide_ids_list.begin(), pilit)].front().getSearchEngine(); for (vector::iterator pit = pilit->begin(); pit != pilit->end(); ++pit) @@ -927,6 +933,7 @@ feature abbreviation feature description } } ins.setScoreType("multiple"); + ins.setIdentifier("TopPerc_multiple_SE_input"); String spectrum_reference = ins.getMetaValue("spectrum_reference"); //merge in unified map if (unified.find(spectrum_reference) == unified.end()) @@ -974,31 +981,53 @@ feature abbreviation feature description } } } - vector swip; + LOG_DEBUG << "filled spectrum map" << endl; + std::vector swip; swip.reserve(unified.size()); + LOG_DEBUG << "merging spectrum map" << endl; for (std::map::iterator it = unified.begin(); it != unified.end(); ++it) { swip.push_back(it->second); } peptide_ids_list.front().swap(swip); peptide_ids_list.resize(1); + LOG_DEBUG << "Now containing " << peptide_ids_list.front().size() << " spectra identifications."<< endl; + LOG_DEBUG << "merging search parameters" << endl; //care for search parameters!! - vector swop; + + std::vector swop; swop.push_back(ProteinIdentification()); - for (vector >::iterator it = protein_ids_list.begin(); it != protein_ids_list.end(); ++it) + swop.back().setIdentifier("TopPerc_multiple_SE_input"); + swop.back().setDateTime(DateTime::currentDateTime()); + swop.back().setSearchParameters(protein_ids_list.front().front().getSearchParameters()); + for (std::vector >::iterator it = protein_ids_list.begin(); it != protein_ids_list.end(); ++it) { std::vector v; - v.reserve(max(swop.front().getHits().size(), it->front().getHits().size())); + v.resize(swop.front().getHits().size() + it->front().getHits().size()); std::vector::iterator uni; std::sort(it->front().getHits().begin(),it->front().getHits().end(), TopPerc::lq_ProteinHit()); - uni = std::set_union(swop.front().getHits().begin(), swop.front().getHits().end(), + LOG_DEBUG << "Sorted next part of the ProteinHits." 
<< endl; + LOG_DEBUG << "Melting with that many previous ProteinHits. " << swop.front().getHits().size() << endl; + if (swop.front().getHits().empty()) + { + v.swap(it->front().getHits()); + } + else + { + uni = std::set_union(swop.front().getHits().begin(), swop.front().getHits().end(), it->front().getHits().begin(),it->front().getHits().end(), v.begin(), TopPerc::lq_ProteinHit()); - v.resize(uni-v.begin()); + v.resize(uni-v.begin()); + } + LOG_DEBUG << "Melting ProteinHits." << endl; + swap(swop.front().getHits(),v); + LOG_DEBUG << "Done with next ProteinHits." << endl; + ProteinIdentification::SearchParameters sp = it->front().getSearchParameters(); String SE = it->front().getSearchEngine(); + LOG_DEBUG << "Melting Parameters from " << SE << " into MetaInfo." << endl; {//insert into MetaInfo as SE:param swop.front().setMetaValue("SE:"+SE,it->front().getSearchEngineVersion()); swop.front().setMetaValue(SE+":db",sp.db); @@ -1016,9 +1045,11 @@ feature abbreviation feature description } swop.front().setPrimaryMSRunPath(it->front().getPrimaryMSRunPath()); swop.front().setSearchEngine("multiple"); + LOG_DEBUG << "Done with next Parameters." << endl; } protein_ids_list.front().swap(swop); protein_ids_list.resize(1); + LOG_DEBUG << "All merging finished." 
<< endl; } @@ -1040,25 +1071,25 @@ feature abbreviation feature description StringList keys; protein_id.getKeys(keys); - if (ListUtils::contains(keys, "MS-GF+")) + if (ListUtils::contains(keys, "SE:MS-GF+")) { ses_used.push_back("MS-GF+"); se_specifics.push_back("MS:1002049"); // rawscore se_specifics.push_back("MS:1002053"); // evalue } - if (ListUtils::contains(keys, "Mascot")) + if (ListUtils::contains(keys, "SE:Mascot")) { ses_used.push_back("Mascot"); se_specifics.push_back("Mascot_score"); se_specifics.push_back("EValue"); } - if (ListUtils::contains(keys, "Comet")) + if (ListUtils::contains(keys, "SE:Comet")) { ses_used.push_back("Comet"); se_specifics.push_back("MS:1002252"); //xcorr se_specifics.push_back("MS:1002257"); //evalue } - if (ListUtils::contains(keys, "XTandem")) + if (ListUtils::contains(keys, "SE:XTandem")) { ses_used.push_back("XTandem"); se_specifics.push_back("XTandem_score"); @@ -1068,7 +1099,7 @@ feature abbreviation feature description LOG_INFO << "Using " << ListUtils::concatenate(ses_used, ", ") << " as source for search engine specific features." 
<< endl; String featureset = "id,label,ScanNr," - + ListUtils::concatenate(se_specifics, ",") + + ListUtils::concatenate(se_specifics, ",") + "," + ss.str() + "ionfrac,mass,enzN,enzC,enzInt,numHits,dM,absdM,PepLen,peptide,proteinId1"; StringList txt_header = ListUtils::create(featureset); @@ -1083,15 +1114,13 @@ feature abbreviation feature description { it->sort(); it->assignRanks(); - String scannumber = getScanIdentifier(it, peptide_ids.begin()); + String scanidentifier = getScanIdentifier(it, peptide_ids.begin()); + StringList idents; + scanidentifier.split("=",idents); + String scannr = idents.back(); std::vector hits = it->getHits(); for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) { - StringList idents; - idents.push_back(it->getBaseName()); - idents.push_back(scannumber); - idents.push_back(String(jt->getRank())); - String sid = ListUtils::concatenate(idents, "_"); int charge = jt->getCharge(); int label = 1; if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) @@ -1102,7 +1131,10 @@ feature abbreviation feature description StringList sesp; for (StringList::iterator s = se_specifics.begin(); s != se_specifics.end(); ++s) { - sesp.push_back(String(jt->getMetaValue(*s))); + if (jt->metaValueExists(*s)) + sesp.push_back(String(jt->getMetaValue(*s))); + else + sesp.push_back("-1"); } StringList chargen; @@ -1141,10 +1173,10 @@ feature abbreviation feature description String sequence = ""; //replace flanking aa if [ or ] with - char pb = jt->getPeptideEvidences().front().getAABefore(); - sequence += pb=='['?"-.":String(pb); // just first peptide evidence + sequence += pb=='['?"-.":String(pb)+"."; // just first peptide evidence sequence += jt->getSequence().toString(); char pa = jt->getPeptideEvidences().front().getAAAfter(); - sequence += pa==']'?".-":String(pa); // just first peptide evidence + sequence += pa==']'?".-":"."+String(pa); // just first peptide evidence //proteinId1 StringList 
pepevid; for (vector::const_iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) @@ -1153,9 +1185,9 @@ feature abbreviation feature description } StringList row; - row.push_back(sid); + row.push_back(scanidentifier+"_"+String(std::distance(hits.begin(),jt))); row.push_back(label); - row.push_back(scannumber); + row.push_back(scannr); row.push_back(ListUtils::concatenate(sesp, out_sep)); row.push_back(ListUtils::concatenate(chargen, out_sep)); row.push_back(ionfrac); diff --git a/src/utils/TopPerc.cpp b/src/utils/TopPerc.cpp index 5d1d1ee7f52..74e831d1403 100644 --- a/src/utils/TopPerc.cpp +++ b/src/utils/TopPerc.cpp @@ -171,6 +171,7 @@ class TOPPPercolator : registerFlag_("fido-truncation", "Proteins with a very low score (< 0.001) will be truncated (assigned 0.0 probability).(Only valid if option -A is active)", true); registerFlag_("Q", "Uses protein group level inference, each cluster of proteins is either present or not, therefore when grouping proteins discard all possible combinations for each group.(Only valid if option -A is active and -N is inactive).", true); registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); + registerFlag_("same_search_db", "Manual override to ckeck if same settings for multiple search engines were applied.", true); } ExitCodes main_(int, const char**) @@ -335,7 +336,7 @@ class TOPPPercolator : String se = protein_ids_list.front().front().getSearchEngine(); for (vector >::iterator pilit = protein_ids_list.begin(); pilit != protein_ids_list.end(); ++pilit) { - if (se != protein_ids_list.front().front().getSearchEngine()) + if (se != pilit->front().getSearchEngine()) { se = "multiple"; break; @@ -371,7 +372,8 @@ class TOPPPercolator : //ignore all but first input if NOT multiple for now if (se == "multiple") { - TopPerc::mergeMULTIids(protein_ids_list,peptide_ids_list); // will collapse the list (reference) + 
TopPerc::mergeMULTIids(protein_ids_list,peptide_ids_list, getFlag_("same_search_db")); // will collapse the list (reference) + LOG_DEBUG << "Merged to sizes " << protein_ids_list.size() << " and " << protein_ids_list.size() << endl; TopPerc::prepareMULTIpin(peptide_ids_list.front(), protein_ids_list.front().front(), enz_str, txt, min_charge, max_charge); } //TODO introduce custom feature selection from TopPerc::prepareCUSTOMpin to parameters @@ -472,6 +474,8 @@ class TOPPPercolator : // reintegrate pout results //------------------------------------------------------------- // when percolator finished calculation, it stores the results -r option (with or without -U) or -m (which seems to be not working) + // WARNING: The -r option cannot be used in conjunction with -U: no peptide level statistics are calculated, redirecting PSM level statistics to provided file instead. + CsvFile csv_file(pout_file, '\t'); map > pep_map; @@ -481,11 +485,14 @@ class TOPPPercolator : { csv_file.getRow(i, row); TopPerc::PercolatorResult res(row); - if (pep_map.find(res.PSMId) == pep_map.end()) + StringList spl; + res.PSMId.split("_",spl); + String spec_ref = spl.front(); + if (pep_map.find(spec_ref) == pep_map.end()) { - pep_map[res.PSMId] = vector(); + pep_map[spec_ref] = vector(); } - pep_map[res.PSMId].push_back(res); + pep_map[spec_ref].push_back(res); } // As the percolator output file is not needed anymore, the temporary directory is going to be deleted @@ -500,20 +507,21 @@ class TOPPPercolator : // Add the percolator results to the peptide vector of the original input file size_t c_debug = 0; + size_t cnt = 0; for (vector::iterator it = peptide_ids_list.front().begin(); it != peptide_ids_list.front().end(); ++it) { String sid = it->getMetaValue("spectrum_reference"); + sid = sid.removeWhitespaces(); if (pep_map.find(sid) == pep_map.end()) { - LOG_DEBUG << "No suitable PeptideIdentification entry found for .pout entry " << sid; + String sid_ = sid; vector sr; sid.split('=', 
sr); sid = sr.back(); - LOG_DEBUG << " - retry with " << sid << endl; if (pep_map.find(sid) == pep_map.end()) { ++c_debug; - LOG_DEBUG << "Also none found" << endl; + LOG_DEBUG << "No suitable PeptideIdentification entry found for .pout entry " << sid << " or " << sid_ << endl; continue; } } @@ -531,12 +539,13 @@ class TOPPPercolator : hit->setMetaValue("MS:1001492", pr->score); // svm score hit->setMetaValue("MS:1001491", pr->qvalue); // percolator q value hit->setMetaValue("MS:1001493", pr->posterior_error_prob); // percolator pep + ++cnt; } } } } - - LOG_DEBUG << "No suitable PeptideIdentification for " << c_debug << " out of " << peptide_ids_list.front().size() << endl; + LOG_INFO << "No suitable PeptideIdentification for " << c_debug << " out of " << peptide_ids_list.front().size() << endl; + LOG_INFO << "No suitable PeptideHits for " << cnt << " found." << endl; for (vector::iterator it = protein_ids_list.front().begin(); it != protein_ids_list.front().end(); ++it) { From 0f27ffba74033cd0ec004f0bed11eb7a9d4c10af Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sat, 11 Jun 2016 15:34:18 +0200 Subject: [PATCH 19/41] [FEAT] added concat pin for topperc --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 1 + src/openms/source/ANALYSIS/ID/TopPerc.cpp | 160 +++++++++++++++++- src/utils/TopPerc.cpp | 15 +- 3 files changed, 172 insertions(+), 4 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 7188c720ee6..3563161508f 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -115,6 +115,7 @@ namespace OpenMS static void prepareCOMETpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); static void prepareMASCOTpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); static void prepareMULTIpin(std::vector& peptide_ids, 
ProteinIdentification& protein_id, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); + static void prepareCONCATpin(std::vector >& peptide_id_list, std::vector >& protein_id_list, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); static size_t countEnzymatic(String peptide, std::string& enz); static double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons); static String getScanIdentifier(std::vector::iterator it, std::vector::iterator start); diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index e4b1b0b7736..56200edecaf 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -40,7 +40,7 @@ using namespace std; namespace OpenMS { //TODO for all prepare* PSMId as written in PeptideIdentification::spectrum_reference - // and pre/post AA as - if begin/end of protein ([/] in PeptideEvidence) + // and pre/post AA as - if begin/end of protein ([/] in PeptideEvidence) - see prepareMULTIpin //id label scannr feature1 ... featureN peptide proteinId1 .. 
proteinIdM void TopPerc::prepareCUSTOMpin(vector& peptide_ids, TextFile& txt, vector& user_param_features, char out_sep) @@ -1207,4 +1207,162 @@ feature abbreviation feature description } } + void TopPerc::prepareCONCATpin(vector >& peptide_id_list, vector >& protein_id_list, string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep) + { + //------------------------------------------------------------- + // header + //------------------------------------------------------------- + // Create String of the charges for the header of the tab file + stringstream ss; + ss << "Charge" << min_charge << ", "; + for (int j = min_charge+1; j <= max_charge; j++) + { + ss << "Charge" << j << ","; + } + + StringList ses_used; + for (vector >::iterator it = protein_id_list.begin(); it != protein_id_list.end(); ++it) + { + ses_used.push_back(it->front().getSearchEngine()); + } + + LOG_INFO << "Using " << ListUtils::concatenate(ses_used, ", ") << " as source for search engine specific features." 
<< endl; + + String featureset = "id,label,ScanNr," + + ListUtils::concatenate(ses_used, ",") + "," + + ss.str() + + "Evalue,ionfrac,mass,enzN,enzC,enzInt,numHits,dM,absdM,PepLen,peptide,proteinId1"; + StringList txt_header = ListUtils::create(featureset); + // Insert the header with the features names to the file + txt.addLine(ListUtils::concatenate(txt_header, out_sep)); + + //------------------------------------------------------------- + // values + //------------------------------------------------------------- + // get all the feature values + for (vector >::iterator pit = peptide_id_list.begin(); pit != peptide_id_list.end(); ++pit) + { + size_t i = std::distance(peptide_id_list.begin(),pit); + String se = protein_id_list[i].front().getSearchEngine(); + for (vector::iterator it = pit->begin(); it != pit->end(); ++it) + { + it->sort(); + it->assignRanks(); + String scanidentifier = getScanIdentifier(it, pit->begin()); + StringList idents; + scanidentifier.split("=",idents); + String scannr = idents.back(); + std::vector hits = it->getHits(); + for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) + { + int charge = jt->getCharge(); + int label = 1; + if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) + { + label = -1; + } + + StringList sesp; + String ev; + for (StringList::iterator s = ses_used.begin(); s != ses_used.end(); ++s) + { + if ((*s) == se) + { + if (se == "MS-GF+") + { + sesp.push_back(jt->getMetaValue("MS:1002049")); // rawscore + ev = jt->getMetaValue("MS:1002053"); // evalue + } + if (se == "Mascot") + { + sesp.push_back(jt->getMetaValue("Mascot_score")); + ev = jt->getMetaValue("EValue"); + } + if (se == "Comet") + { + sesp.push_back(jt->getMetaValue("MS:1002252")); //xcorr + ev = jt->getMetaValue("MS:1002257"); //evalue + } + if (se == "XTandem") + { + sesp.push_back(jt->getMetaValue("XTandem_score")); + ev = jt->getMetaValue("E-Value"); + } + } + else + sesp.push_back("-1"); 
+ } + + StringList chargen; + // write 1 for the correct charge, 0 for other charges + for (int i = min_charge; i <= max_charge; ++i) + { + if (charge != i) + { + chargen.push_back("0"); + } + else + { + chargen.push_back("1"); + } + } + + //IonFrac + String ionfrac = String(double(jt->getMetaValue("matched_intensity"))/double(jt->getMetaValue("sum_intensity"))); // also consider "matched_ion_number"/"peak_number" + //Mass + double mass = jt->getSequence().getMonoWeight(Residue::Full, charge)/charge; + //enzN + bool enzN = isEnz(jt->getPeptideEvidences().front().getAABefore(), jt->getSequence().getPrefix(1).toString().c_str()[0], enz); + //enzC + bool enzC = isEnz(jt->getSequence().getSuffix(1).toString().c_str()[0], jt->getPeptideEvidences().front().getAAAfter(), enz); + //enzInt + int enzInt = countEnzymatic(jt->getSequence().toUnmodifiedString(), enz); + //numHits + int numHits = jt->getScore(); + //dM + double dm = it->getMZ() - mass; + //absdM + double absdm = abs(dm); + //PepLen + int peplen = jt->getSequence().size(); + //peptide + String sequence = ""; + //replace flanking aa if [ or ] with - + char pb = jt->getPeptideEvidences().front().getAABefore(); + sequence += pb=='['?"-.":String(pb)+"."; // just first peptide evidence + sequence += jt->getSequence().toString(); + char pa = jt->getPeptideEvidences().front().getAAAfter(); + sequence += pa==']'?".-":"."+String(pa); // just first peptide evidence + //proteinId1 + StringList pepevid; + for (vector::const_iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) + { + pepevid.push_back(kt->getProteinAccession()); + } + + StringList row; + row.push_back(scanidentifier+"_"+String(std::distance(hits.begin(),jt))); + row.push_back(label); + row.push_back(scannr); + row.push_back(ListUtils::concatenate(sesp, out_sep)); + row.push_back(ListUtils::concatenate(chargen, out_sep)); + row.push_back(ev); + row.push_back(ionfrac); + row.push_back(String(mass)); + 
row.push_back(String(enzN)); + row.push_back(String(enzC)); + row.push_back(String(enzInt)); + row.push_back(String(numHits)); + row.push_back(String(dm)); + row.push_back(String(absdm)); + row.push_back(String(peplen)); + row.push_back(sequence); + row.push_back(ListUtils::concatenate(pepevid, out_sep)); + + txt.addLine(ListUtils::concatenate(row, out_sep)); + } + } + } + } + } diff --git a/src/utils/TopPerc.cpp b/src/utils/TopPerc.cpp index 74e831d1403..9da9db7d4cb 100644 --- a/src/utils/TopPerc.cpp +++ b/src/utils/TopPerc.cpp @@ -172,6 +172,7 @@ class TOPPPercolator : registerFlag_("Q", "Uses protein group level inference, each cluster of proteins is either present or not, therefore when grouping proteins discard all possible combinations for each group.(Only valid if option -A is active and -N is inactive).", true); registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); registerFlag_("same_search_db", "Manual override to ckeck if same settings for multiple search engines were applied.", true); + registerFlag_("concat", "Manual override to concatenate multiple search results instead of merging on scan level.", true); } ExitCodes main_(int, const char**) @@ -372,9 +373,17 @@ class TOPPPercolator : //ignore all but first input if NOT multiple for now if (se == "multiple") { - TopPerc::mergeMULTIids(protein_ids_list,peptide_ids_list, getFlag_("same_search_db")); // will collapse the list (reference) - LOG_DEBUG << "Merged to sizes " << protein_ids_list.size() << " and " << protein_ids_list.size() << endl; - TopPerc::prepareMULTIpin(peptide_ids_list.front(), protein_ids_list.front().front(), enz_str, txt, min_charge, max_charge); + if (getFlag_("concat")) + { + LOG_DEBUG << "Concatenating " << protein_ids_list.size() << " and " << peptide_ids_list.size() << endl; + TopPerc::prepareCONCATpin(peptide_ids_list, protein_ids_list, enz_str, txt, min_charge, max_charge); + } + else + { + 
TopPerc::mergeMULTIids(protein_ids_list,peptide_ids_list, getFlag_("same_search_db")); // will collapse the list (reference) + LOG_DEBUG << "Merged to sizes " << protein_ids_list.size() << " and " << peptide_ids_list.size() << endl; + TopPerc::prepareMULTIpin(peptide_ids_list.front(), protein_ids_list.front().front(), enz_str, txt, min_charge, max_charge); + } } //TODO introduce custom feature selection from TopPerc::prepareCUSTOMpin to parameters else if (se == "MS-GF+") TopPerc::prepareMSGFpin(peptide_ids_list.front(), enz_str, txt, min_charge, max_charge, getFlag_("MHC")); From 329876e4e7dfb85069df85d52593740d817e962a Mon Sep 17 00:00:00 2001 From: Matthew The Date: Tue, 12 Jul 2016 12:22:19 +0200 Subject: [PATCH 20/41] Successful build, used scan identifiers as SpecId for all search engines --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 6 +- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 131 +++++++------ src/utils/TopPerc.cpp | 181 +++++++++--------- 3 files changed, 167 insertions(+), 151 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 3563161508f..1bcd9026407 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -39,10 +39,12 @@ #include #include #include +#include #include #include #include +#include namespace OpenMS { @@ -116,9 +118,11 @@ namespace OpenMS static void prepareMASCOTpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); static void prepareMULTIpin(std::vector& peptide_ids, ProteinIdentification& protein_id, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); static void prepareCONCATpin(std::vector >& peptide_id_list, std::vector >& protein_id_list, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); - static size_t countEnzymatic(String peptide, std::string& enz); + static void 
readPoutAsMap(String pout_file, std::map >& pep_map); + static Size countEnzymatic(String peptide, std::string& enz); static double rescaleFragmentFeature(double featureValue, int NumMatchedMainIons); static String getScanIdentifier(std::vector::iterator it, std::vector::iterator start); + static Int getScanNumber(String scan_identifier); static void assignDeltaScore(std::vector& hits, String score_ref); static void mergeMULTIids(std::vector >& protein_ids_list, std::vector >& peptide_ids_list, bool skip_checks=false); diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 56200edecaf..01b36c5c958 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -55,8 +55,8 @@ namespace OpenMS { for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - String exp_ref = it->getMetaValue("spectrum_reference").toString(); - String scannumber = getScanIdentifier(it, peptide_ids.begin()); + String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); + Int scan_number = getScanNumber(scan_identifier); int label = 1; if (hit->metaValueExists("target_decoy") && String(hit->getMetaValue("target_decoy")).hasSubstring("decoy")) { @@ -64,9 +64,9 @@ namespace OpenMS } StringList collected_feats; - collected_feats.push_back(exp_ref); + collected_feats.push_back(scan_identifier); collected_feats.push_back(String(label)); - collected_feats.push_back(scannumber); + collected_feats.push_back(String(scan_number)); for (vector::const_iterator feat = user_param_features.begin(); feat != user_param_features.end(); ++feat) { @@ -123,22 +123,18 @@ namespace OpenMS if (hit->metaValueExists("NumMatchedMainIons")) { // only take features from first ranked entries and only with meanerrortop7 != 0.0 - if (hit->getRank() == 1 && hit->getMetaValue("MeanErrorTop7").toString().toDouble() != 0.0) + if (hit->getMetaValue("MeanErrorTop7").toString().toDouble() != 
0.0) { - int rank = hit->getRank(); int charge = hit->getCharge(); - String scannumber = getScanIdentifier(it, peptide_ids.begin()); + String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); + Int scan_number = getScanNumber(scan_identifier); int label = 1; - String SpecId = "target_SII_"; if ((String(hit->getMetaValue("target_decoy"))).hasSubstring("decoy")) { - SpecId = "decoy_SII_"; label = -1; } - SpecId += scannumber + "_" + String(rank) + "_" + String(charge); - double rawScore = hit->getMetaValue("MS:1002049").toString().toDouble(); double denovoScore = hit->getMetaValue("MS:1002050").toString().toDouble(); @@ -216,7 +212,7 @@ namespace OpenMS String protein = hit->getPeptideEvidences().front().getProteinAccession(); // One PeptideSpectrumHit with all its features - String lis = SpecId + out_sep + String(label) + out_sep + scannumber + out_sep + (String)rawScore + out_sep + + String lis = scan_identifier + out_sep + String(label) + out_sep + String(scan_number) + out_sep + (String)rawScore + out_sep + (String)denovoScore + out_sep + (String)scoreRatio + out_sep + (String)energy + out_sep + (String)ln_eval + out_sep + (String)isotopeError + out_sep + (String)lnExplainedIonCurrentRatio + out_sep + (String)lnNTermIonCurrentRatio + out_sep + (String)lnCTermIonCurrentRatio + out_sep + (String)lnMS2IonCurrent @@ -310,7 +306,8 @@ namespace OpenMS { if (it->isHigherScoreBetter()) { - String scannumber = getScanIdentifier(it, peptide_ids.begin()); + String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); + Int scan_number = getScanNumber(scan_identifier); int charge = it->getHits().front().getCharge(); int label = 1; double hyperscore = it->getHits().front().getScore(); @@ -386,8 +383,8 @@ namespace OpenMS String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_target_" + scannumber + "_" + String(charge) + - "_1" + out_sep 
+ String(label) + out_sep + scannumber + out_sep + String(hyperscore) + + String lis = scan_identifier + "_" + String(charge) + + "_1" + out_sep + String(label) + out_sep + String(scan_number) + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; @@ -403,7 +400,8 @@ namespace OpenMS { if (it->isHigherScoreBetter()) { - String scannumber = String(it->getMetaValue("spectrum_reference")); + String scan_identifier = String(it->getMetaValue("spectrum_reference")); + Int scan_number = getScanNumber(scan_identifier); int charge = it->getHits().front().getCharge(); int label = -1; double hyperscore = it->getHits().front().getScore(); @@ -473,7 +471,7 @@ namespace OpenMS String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); // One PeptideSpectrumHit with all its features - String lis = "_tandem_output_file_decoy_" + scannumber + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + scannumber + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep + String lis = scan_identifier + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + String(scan_number) + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep + String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; // peptide Spectrum Hit pushed to the output file @@ -517,15 +515,11 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg } it->sort(); it->assignRanks(); - String scannumber = getScanIdentifier(it, 
peptide_ids.begin()); + String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); + Int scan_number = getScanNumber(scan_identifier); std::vector hits = it->getHits(); for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) { - StringList idents; - idents.push_back(it->getBaseName()); - idents.push_back(scannumber); - idents.push_back(String(jt->getRank())); - String sid = ListUtils::concatenate(idents, "_"); int charge = jt->getCharge(); int label = 1; if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) @@ -591,9 +585,9 @@ id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charg } StringList row; - row.push_back(sid); + row.push_back(scan_identifier); row.push_back(label); - row.push_back(scannumber); + row.push_back(String(scan_number)); row.push_back(lnrSp); row.push_back(deltaLCn); row.push_back(deltaCn); @@ -668,7 +662,8 @@ feature abbreviation feature description } it->sort(); it->assignRanks(); - String scannumber = getScanIdentifier(it, peptide_ids.begin()); + String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); + Int scan_number = getScanNumber(scan_identifier); it->sort(); it->assignRanks(); @@ -676,11 +671,6 @@ feature abbreviation feature description assignDeltaScore(hits, "MS:1001171"); for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) { - StringList idents; - idents.push_back(it->getBaseName()); - idents.push_back(scannumber); - idents.push_back(String(jt->getRank())); - String sid = ListUtils::concatenate(idents, "_"); int label = 1; if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) { @@ -741,9 +731,9 @@ feature abbreviation feature description } StringList row; - row.push_back(sid); + row.push_back(scan_identifier); row.push_back(label); - row.push_back(scannumber); + row.push_back(String(scan_number)); row.push_back(String(mass)); 
row.push_back(ListUtils::concatenate(chargen, out_sep)); row.push_back(String(mScore)); @@ -826,10 +816,10 @@ feature abbreviation feature description } // Function adapted from Enzyme.h in Percolator converter - size_t TopPerc::countEnzymatic(String peptide, string& enz) + Size TopPerc::countEnzymatic(String peptide, string& enz) { - size_t count = 0; - for (size_t ix = 1; ix < peptide.size(); ++ix) + Size count = 0; + for (Size ix = 1; ix < peptide.size(); ++ix) { if (isEnz(peptide[ix - 1], peptide[ix], enz)) { @@ -851,17 +841,31 @@ feature abbreviation feature description String TopPerc::getScanIdentifier(vector::iterator it, vector::iterator start) { - String scannumber = it->getMetaValue("spectrum_reference"); - if (scannumber.empty()) + String scan_identifier = it->getMetaValue("spectrum_reference"); + if (scan_identifier.empty()) { - scannumber = String(it->getMetaValue("spectrum_id")); - if (scannumber.empty()) + scan_identifier = String(it->getMetaValue("spectrum_id")); + if (scan_identifier.empty()) { - scannumber = String(it - start + 1); + scan_identifier = String(it - start + 1); LOG_WARN << "no known spectrum identifiers, using index [1,n] - use at own risk." 
<< endl; } } - return scannumber.removeWhitespaces(); + return scan_identifier.removeWhitespaces(); + } + + Int TopPerc::getScanNumber(String scan_identifier) + { + Size idx = 0; + if ((idx = scan_identifier.find("index=")) != std::string::npos) + { + scan_identifier = scan_identifier.substr(idx + 6); + } + else if ((idx = scan_identifier.find("scan=")) != std::string::npos) + { + scan_identifier = scan_identifier.substr(idx + 5); + } + return scan_identifier.toInt(); } void TopPerc::assignDeltaScore(vector& hits, String score_ref) @@ -892,7 +896,7 @@ feature abbreviation feature description //search parameters of all runs must correspond (considering front() of each only) if (!skip_checks) { - for (size_t i=1; i < protein_ids_list.size(); ++i) + for (Size i=1; i < protein_ids_list.size(); ++i) { if( protein_ids_list[i-1].front().getSearchParameters().db != protein_ids_list[i].front().getSearchParameters().db ) { @@ -922,7 +926,7 @@ feature abbreviation feature description //set score in each hit to #SE hits hit->setScore(1); //rename common meta values (to SE:commonmetavaluename) - for (size_t i = 0; i < commonMetaValues.size(); ++i) + for (Size i = 0; i < commonMetaValues.size(); ++i) { if (hit->metaValueExists(commonMetaValues[i])) { @@ -1114,10 +1118,9 @@ feature abbreviation feature description { it->sort(); it->assignRanks(); - String scanidentifier = getScanIdentifier(it, peptide_ids.begin()); + String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); + Int scan_number = getScanNumber(scan_identifier); StringList idents; - scanidentifier.split("=",idents); - String scannr = idents.back(); std::vector hits = it->getHits(); for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) { @@ -1185,9 +1188,9 @@ feature abbreviation feature description } StringList row; - row.push_back(scanidentifier+"_"+String(std::distance(hits.begin(),jt))); + row.push_back(scan_identifier); row.push_back(label); - row.push_back(scannr); + 
row.push_back(String(scan_number)); row.push_back(ListUtils::concatenate(sesp, out_sep)); row.push_back(ListUtils::concatenate(chargen, out_sep)); row.push_back(ionfrac); @@ -1242,16 +1245,14 @@ feature abbreviation feature description // get all the feature values for (vector >::iterator pit = peptide_id_list.begin(); pit != peptide_id_list.end(); ++pit) { - size_t i = std::distance(peptide_id_list.begin(),pit); + Size i = std::distance(peptide_id_list.begin(),pit); String se = protein_id_list[i].front().getSearchEngine(); for (vector::iterator it = pit->begin(); it != pit->end(); ++it) { it->sort(); it->assignRanks(); - String scanidentifier = getScanIdentifier(it, pit->begin()); - StringList idents; - scanidentifier.split("=",idents); - String scannr = idents.back(); + String scan_identifier = getScanIdentifier(it, pit->begin()); + Int scan_number = getScanNumber(scan_identifier); std::vector hits = it->getHits(); for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) { @@ -1341,9 +1342,9 @@ feature abbreviation feature description } StringList row; - row.push_back(scanidentifier+"_"+String(std::distance(hits.begin(),jt))); + row.push_back(scan_identifier); row.push_back(label); - row.push_back(scannr); + row.push_back(String(scan_number)); row.push_back(ListUtils::concatenate(sesp, out_sep)); row.push_back(ListUtils::concatenate(chargen, out_sep)); row.push_back(ev); @@ -1364,5 +1365,23 @@ feature abbreviation feature description } } } + + void TopPerc::readPoutAsMap(String pout_file, map >& pep_map) + { + CsvFile csv_file(pout_file, '\t'); + StringList row; + + for (Size i = 1; i < csv_file.rowCount(); ++i) + { + csv_file.getRow(i, row); + PercolatorResult res(row); + String spec_ref = res.PSMId; + if (pep_map.find(spec_ref) == pep_map.end()) + { + pep_map[spec_ref] = vector(); + } + pep_map[spec_ref].push_back(res); + } + } } diff --git a/src/utils/TopPerc.cpp b/src/utils/TopPerc.cpp index 9da9db7d4cb..36bb79295c1 100644 --- a/src/utils/TopPerc.cpp 
+++ b/src/utils/TopPerc.cpp @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -110,9 +109,10 @@ class TOPPPercolator : { registerInputFileList_("in", "", StringList(), "Input file(s)", true); setValidFormats_("in", ListUtils::create("mzid,idXML")); - registerInputFile_("in_decoy", "", "", "Input decoy file", false); - setValidFormats_("in_decoy", ListUtils::create("mzid")); - registerOutputFile_("out", "", "", "Output file", true); + registerInputFileList_("in_decoy", "", StringList(), "Input decoy file(s)", false); + setValidFormats_("in_decoy", ListUtils::create("mzid,idXML")); + registerOutputFile_("out", "", "", "Output file in idXML format", true); + registerOutputFile_("mzid_out", "", "", "Output file in mzid format", true); std::string enzs = "no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin"; registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: "+enzs , false); setValidStrings_("enzyme", ListUtils::create(enzs)); @@ -127,19 +127,20 @@ class TOPPPercolator : ); //Advanced parameters -// //registerOutputFile_("r", "", "out", "Output tab delimited results to a file instead of stdout", false, true); - registerOutputFile_("X", "", "", "path to file in xml-output format (pout). Default is: pout.tab", false, true); - registerFlag_("e", "read xml-input format (pin) from standard input", true); - registerFlag_("Z", "Include decoys (PSMs, peptides and/or proteins) in the xml-output. Only available if -X is used.", true); + //registerOutputFile_("r", "", "out", "Output tab delimited results to a file instead of stdout", false, true); + //registerOutputFile_("B", "", "", "Output tab delimited results for decoys into a file", false, true); + //registerOutputFile_("X", "", "", "path to file in xml-output format (pout). 
Default is: pout.tab", false, true); + //registerFlag_("e", "read xml-input format (pin) from standard input", true); + //registerFlag_("Z", "Include decoys (PSMs, peptides and/or proteins) in the xml-output. Only available if -X is used.", true); registerDoubleOption_("p", "", 0.0, "Cpos, penalty for mistakes made on positive examples. Set by cross validation if not specified.", false, true); registerDoubleOption_("n", "", 0.0, "Cneg, penalty for mistakes made on negative examples. Set by cross validation if not specified.", false, true); registerDoubleOption_("F", "", 0.01, "False discovery rate threshold to define positive examples in training. Set by cross validation if 0. Default is 0.01.", false, true); registerDoubleOption_("t", "", 0.01, "False discovery rate threshold for evaluating best cross validation result and the reported end result. Default is 0.01.", false, true); registerIntOption_("i", "", 0, "Maximal number of iterations", false, true); registerFlag_("x", "Quicker execution by reduced internal cross-validation.", true); - registerDoubleOption_("f", "", 0.6, "Fraction of the negative data set to be used as train set when only providing one negative set, remaining examples will be used as test set. Set to 0.6 by default.", false, true); - registerOutputFile_("J", "", "", "Output the computed features to the given file in tab-delimited format. A file with the features with the given file name will be created", false, true); - registerInputFile_("k", "", "", "Input file given in the deprecated pin-xml format generated by e.g. sqt2pin with the -k option", false, true); + //registerDoubleOption_("f", "", 0.6, "Fraction of the negative data set to be used as train set when only providing one negative set, remaining examples will be used as test set. Set to 0.6 by default.", false, true); + //registerOutputFile_("J", "", "", "Output the computed features to the given file in tab-delimited format. 
A file with the features with the given file name will be created", false, true); + //registerInputFile_("k", "", "", "Input file given in the deprecated pin-xml format generated by e.g. sqt2pin with the -k option", false, true); registerOutputFile_("w", "", "", "Output final weights to the given file", false, true); registerInputFile_("W", "", "", "Read initial weights to the given file", false, true); registerStringOption_("V", "", "", "The most informative feature given as the feature name, can be negated to indicate that a lower value is better.", false, true); @@ -150,26 +151,25 @@ class TOPPPercolator : registerIntOption_("S", "", 1, "Setting seed of the random number generator. Default value is 1", false, true); registerFlag_("K", "Retention time features calculated as in Klammer et al.", true); registerFlag_("D", "Include description of correct features", true); - registerOutputFile_("B", "", "", "Output tab delimited results for decoys into a file", false, true); registerFlag_("U", "Do not remove redundant peptides, keep all PSMS and exclude peptide level probabilities.", true); - registerFlag_("s", "skip validation of input file against xml schema", true); - registerFlag_("A", "output protein level probabilities", true); - registerDoubleOption_("a", "", 0.0, "Probability with which a present protein emits an associated peptide (to be used jointly with the -A option). Set by grid search if not specified.", false, true); - registerDoubleOption_("b", "", 0.0, "Probability of the creation of a peptide from noise (to be used jointly with the -A option). Set by grid search if not specified", false, true); - registerDoubleOption_("G", "", 0.0, "Prior probability of that a protein is present in the sample ( to be used with the -A option). 
Set by grid search if not specified", false, true); - registerFlag_("g", "treat ties as if it were one protein (Only valid if option -A is active).", true); - registerFlag_("I", "use pi_0 value when calculating empirical q-values (no effect if option Q is activated) (Only valid if option -A is active).", true); - registerFlag_("q", "output empirical q-values and p-values (from target-decoy analysis) (Only valid if option -A is active).", true); - registerFlag_("N", "disactivates the grouping of proteins with similar connectivity, for example if proteins P1 and P2 have the same peptides matching both of them, P1 and P2 will not be grouped as one protein (Only valid if option -A is active).", true); - registerFlag_("E", "Proteins graph will not be separated in sub-graphs (Only valid if option -A is active).", true); - registerFlag_("C", "it does not prune peptides with a very low score (~0.0) which means that if a peptide with a very low score is matching two proteins, when we prune the peptide,it will be duplicated to generate two new protein groups (Only valid if option -A is active).", true); - registerIntOption_("d", "", 0, "Setting depth 0 or 1 or 2 from low depth to high depth(less computational time) of the grid search for the estimation Alpha,Beta and Gamma parameters for fido(Only valid if option -A is active). Default value is 0", false, true); + //registerFlag_("s", "skip validation of input file against xml schema", true); + //registerFlag_("A", "output protein level probabilities", true); + //registerDoubleOption_("a", "", 0.0, "Probability with which a present protein emits an associated peptide (to be used jointly with the -A option). Set by grid search if not specified.", false, true); + //registerDoubleOption_("b", "", 0.0, "Probability of the creation of a peptide from noise (to be used jointly with the -A option). 
Set by grid search if not specified", false, true); + //registerDoubleOption_("G", "", 0.0, "Prior probability of that a protein is present in the sample ( to be used with the -A option). Set by grid search if not specified", false, true); + //registerFlag_("g", "treat ties as if it were one protein (Only valid if option -A is active).", true); + //registerFlag_("I", "use pi_0 value when calculating empirical q-values (no effect if option Q is activated) (Only valid if option -A is active).", true); + //registerFlag_("q", "output empirical q-values and p-values (from target-decoy analysis) (Only valid if option -A is active).", true); + //registerFlag_("N", "disactivates the grouping of proteins with similar connectivity, for example if proteins P1 and P2 have the same peptides matching both of them, P1 and P2 will not be grouped as one protein (Only valid if option -A is active).", true); + //registerFlag_("E", "Proteins graph will not be separated in sub-graphs (Only valid if option -A is active).", true); + //registerFlag_("C", "it does not prune peptides with a very low score (~0.0) which means that if a peptide with a very low score is matching two proteins, when we prune the peptide,it will be duplicated to generate two new protein groups (Only valid if option -A is active).", true); + //registerIntOption_("d", "", 0, "Setting depth 0 or 1 or 2 from low depth to high depth(less computational time) of the grid search for the estimation Alpha,Beta and Gamma parameters for fido(Only valid if option -A is active). 
Default value is 0", false, true); registerStringOption_("P", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (by default : random) (Only valid if option -A is active).", false, true); - registerFlag_("T", "Reduce the tree of proteins (removing low scored proteins) in order to estimate alpha,beta and gamma faster.(Only valid if option -A is active).", true); + //registerFlag_("T", "Reduce the tree of proteins (removing low scored proteins) in order to estimate alpha,beta and gamma faster.(Only valid if option -A is active).", true); registerFlag_("Y", "Use target decoy competition to compute peptide probabilities.(recommended when using -A).", true); - registerFlag_("H", "Q-value threshold that will be used in the computation of the MSE and ROC AUC score in the grid search (recommended 0.05 for normal size datasets and 0.1 for big size datasets).(Only valid if option -A is active).", true); - registerFlag_("fido-truncation", "Proteins with a very low score (< 0.001) will be truncated (assigned 0.0 probability).(Only valid if option -A is active)", true); - registerFlag_("Q", "Uses protein group level inference, each cluster of proteins is either present or not, therefore when grouping proteins discard all possible combinations for each group.(Only valid if option -A is active and -N is inactive).", true); + //registerFlag_("H", "Q-value threshold that will be used in the computation of the MSE and ROC AUC score in the grid search (recommended 0.05 for normal size datasets and 0.1 for big size datasets).(Only valid if option -A is active).", true); + //registerFlag_("fido-truncation", "Proteins with a very low score (< 0.001) will be truncated (assigned 0.0 probability).(Only valid if option -A is active)", true); + //registerFlag_("Q", "Uses protein group level inference, each cluster of proteins is either present or not, therefore when grouping 
proteins discard all possible combinations for each group.(Only valid if option -A is active and -N is inactive).", true); registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); registerFlag_("same_search_db", "Manual override to ckeck if same settings for multiple search engines were applied.", true); registerFlag_("concat", "Manual override to concatenate multiple search results instead of merging on scan level.", true); @@ -187,8 +187,8 @@ class TOPPPercolator : // parsing parameters //------------------------------------------------------------- const StringList in_list = getStringList_("in"); - const String in_decoy = getStringOption_("in_decoy"); - LOG_DEBUG << "Input file (of target?): " << ListUtils::concatenate(in_list, ",") << " & " << in_decoy << " (decoy)" << endl; + const StringList in_decoy = getStringList_("in_decoy"); + LOG_DEBUG << "Input file (of target?): " << ListUtils::concatenate(in_list, ",") << " & " << ListUtils::concatenate(in_decoy, ",") << " (decoy)" << endl; const String percolator_executable(getStringOption_("percolator_executable")); writeDebug_(String("Path to the percolator: ") + percolator_executable, 2); @@ -198,6 +198,15 @@ class TOPPPercolator : printUsage_(); return ILLEGAL_PARAMETERS; } + + const String mzid_out(getStringOption_("mzid_out")); + const String out(getStringOption_("out")); + if (mzid_out.empty() && out.empty()) + { + writeLog_("Fatal error: no output file given (parameter 'out' or 'mzid_out')"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } //------------------------------------------------------------- // read input @@ -260,15 +269,15 @@ class TOPPPercolator : vector decoy_peptide_ids; vector decoy_protein_ids; FileHandler fh; - FileTypes::Type in_decoy_type = fh.getType(in_decoy); + FileTypes::Type in_decoy_type = fh.getType(in_decoy.front()); if (in_decoy_type == FileTypes::IDXML) { - IdXMLFile().load(in_decoy, decoy_protein_ids, decoy_peptide_ids); + 
IdXMLFile().load(in_decoy.front(), decoy_protein_ids, decoy_peptide_ids); } else if (in_decoy_type == FileTypes::MZIDENTML) { LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; - MzIdentMLFile().load(in_decoy, decoy_protein_ids, decoy_peptide_ids); + MzIdentMLFile().load(in_decoy.front(), decoy_protein_ids, decoy_peptide_ids); } //paranoia check if this comes from the same search engine! (only in the first proteinidentification of the first proteinidentifications vector vector) @@ -403,24 +412,28 @@ class TOPPPercolator : } String txt_designator = File::getUniqueName(); String pin_file(temp_directory_body + txt_designator + "_pin.tab"); - String pout_file(temp_directory_body + txt_designator + "_pout.tab"); + String pout_target_file(temp_directory_body + txt_designator + "_target_pout.tab"); + String pout_decoy_file(temp_directory_body + txt_designator + "_decoy_pout.tab"); txt.store(pin_file); QStringList arguments; // Check all set parameters and get them into arguments StringList { - arguments << "-r" << pout_file.toQString(); - if (getFlag_("e")) arguments << "-e"; - if (getFlag_("Z")) arguments << "-Z"; + arguments << "-U"; + arguments << "-m" << pout_target_file.toQString(); + arguments << "-M" << pout_decoy_file.toQString(); + //if (getFlag_("U")) arguments << "-U"; + //if (getFlag_("e")) arguments << "-e"; + //if (getFlag_("Z")) arguments << "-Z"; if (getDoubleOption_("p") != 0.0) arguments << "-p" << String(getDoubleOption_("p")).toQString(); if (getDoubleOption_("n") != 0.0) arguments << "-n" << String(getDoubleOption_("n")).toQString(); if (getDoubleOption_("F") != 0.01) arguments << "-F" << String(getDoubleOption_("F")).toQString(); if (getDoubleOption_("t") != 0.01) arguments << "-t" << String(getDoubleOption_("t")).toQString(); if (getIntOption_("i") != 0) arguments << "-i" << String(getIntOption_("i")).toQString(); if (getFlag_("x")) arguments << "-x"; - if 
(getDoubleOption_("f") != 0.6) arguments << "-f" << String(getDoubleOption_("f")).toQString(); - if (getStringOption_("J") != "") arguments << "-J" << getStringOption_("J").toQString(); - if (getStringOption_("k") != "") arguments << "-k" << getStringOption_("k").toQString(); + //if (getDoubleOption_("f") != 0.6) arguments << "-f" << String(getDoubleOption_("f")).toQString(); + //if (getStringOption_("J") != "") arguments << "-J" << getStringOption_("J").toQString(); + //if (getStringOption_("k") != "") arguments << "-k" << getStringOption_("k").toQString(); if (getStringOption_("w") != "") arguments << "-w" << getStringOption_("w").toQString(); if (getStringOption_("W") != "") arguments << "-W" << getStringOption_("W").toQString(); if (getStringOption_("V") != "") arguments << "-V" << getStringOption_("V").toQString(); @@ -431,27 +444,24 @@ class TOPPPercolator : if (getIntOption_("S") != 1) arguments << "-S" << String(getDoubleOption_("S")).toQString(); if (getFlag_("K")) arguments << "-K"; if (getFlag_("D")) arguments << "-D"; - if (getStringOption_("B") != "") arguments << "-B" << getStringOption_("B").toQString(); - if (getFlag_("U")) arguments << "-U"; - if (getFlag_("s")) arguments << "-s"; - if (getFlag_("A")) arguments << "-A"; - if (getDoubleOption_("a") != 0.0) arguments << "-a" << String(getDoubleOption_("a")).toQString(); - if (getDoubleOption_("b") != 0.0) arguments << "-b" << String(getDoubleOption_("b")).toQString(); - if (getDoubleOption_("G") != 0.0) arguments << "-G" << String(getDoubleOption_("G")).toQString(); - if (getFlag_("g")) arguments << "-g"; - if (getFlag_("I")) arguments << "-I"; - if (getFlag_("q")) arguments << "-q"; - if (getFlag_("N")) arguments << "-N"; - if (getFlag_("E")) arguments << "-E"; - if (getFlag_("C")) arguments << "-C"; - if (getIntOption_("d") != 0) arguments << "-d" << String(getIntOption_("d")).toQString(); + //if (getFlag_("s")) arguments << "-s"; + //if (getFlag_("A")) arguments << "-A"; + //if 
(getDoubleOption_("a") != 0.0) arguments << "-a" << String(getDoubleOption_("a")).toQString(); + //if (getDoubleOption_("b") != 0.0) arguments << "-b" << String(getDoubleOption_("b")).toQString(); + //if (getDoubleOption_("G") != 0.0) arguments << "-G" << String(getDoubleOption_("G")).toQString(); + //if (getFlag_("g")) arguments << "-g"; + //if (getFlag_("I")) arguments << "-I"; + //if (getFlag_("q")) arguments << "-q"; + //if (getFlag_("N")) arguments << "-N"; + //if (getFlag_("E")) arguments << "-E"; + //if (getFlag_("C")) arguments << "-C"; + //if (getIntOption_("d") != 0) arguments << "-d" << String(getIntOption_("d")).toQString(); if (getStringOption_("P") != "random") arguments << "-P" << getStringOption_("P").toQString(); - if (getFlag_("T")) arguments << "-T"; + //if (getFlag_("T")) arguments << "-T"; if (getFlag_("Y")) arguments << "-Y"; - if (getFlag_("H")) arguments << "-H"; - if (getFlag_("fido-truncation")) arguments << "--fido-truncation"; - if (getFlag_("Q")) arguments << "-Q"; - arguments << "-U"; + //if (getFlag_("H")) arguments << "-H"; + //if (getFlag_("fido-truncation")) arguments << "--fido-truncation"; + //if (getFlag_("Q")) arguments << "-Q"; arguments << pin_file.toQString(); } writeLog_("Prepared percolator input."); @@ -484,25 +494,9 @@ class TOPPPercolator : //------------------------------------------------------------- // when percolator finished calculation, it stores the results -r option (with or without -U) or -m (which seems to be not working) // WARNING: The -r option cannot be used in conjunction with -U: no peptide level statistics are calculated, redirecting PSM level statistics to provided file instead. 
- - CsvFile csv_file(pout_file, '\t'); - map > pep_map; - StringList row; - - for (size_t i = 1; i < csv_file.rowCount(); ++i) - { - csv_file.getRow(i, row); - TopPerc::PercolatorResult res(row); - StringList spl; - res.PSMId.split("_",spl); - String spec_ref = spl.front(); - if (pep_map.find(spec_ref) == pep_map.end()) - { - pep_map[spec_ref] = vector(); - } - pep_map[spec_ref].push_back(res); - } + TopPerc::readPoutAsMap(pout_target_file, pep_map); + TopPerc::readPoutAsMap(pout_decoy_file, pep_map); // As the percolator output file is not needed anymore, the temporary directory is going to be deleted if (this->debug_level_ < 99) @@ -511,7 +505,7 @@ class TOPPPercolator : } else { - LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <2 to remove them." << std::endl; + LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <99 to remove them." << std::endl; } // Add the percolator results to the peptide vector of the original input file @@ -519,27 +513,19 @@ class TOPPPercolator : size_t cnt = 0; for (vector::iterator it = peptide_ids_list.front().begin(); it != peptide_ids_list.front().end(); ++it) { - String sid = it->getMetaValue("spectrum_reference"); - sid = sid.removeWhitespaces(); - if (pep_map.find(sid) == pep_map.end()) + String scan_identifier = TopPerc::getScanIdentifier(it, peptide_ids_list.front().begin()); + if (pep_map.find(scan_identifier) == pep_map.end()) { - String sid_ = sid; - vector sr; - sid.split('=', sr); - sid = sr.back(); - if (pep_map.find(sid) == pep_map.end()) - { - ++c_debug; - LOG_DEBUG << "No suitable PeptideIdentification entry found for .pout entry " << sid << " or " << sid_ << endl; - continue; - } + ++c_debug; + LOG_DEBUG << "No suitable PeptideIdentification entry found for .pout entry " << scan_identifier << endl; + continue; } //check each PeptideHit for compliance with one of the PercolatorResults (by sequence) for (vector::iterator hit = 
it->getHits().begin(); hit != it->getHits().end(); ++hit) { String pis = hit->getSequence().toUnmodifiedString(); - for (vector::iterator pr = pep_map.find(sid)->second.begin(); pr != pep_map.find(sid)->second.end(); ++pr) + for (vector::iterator pr = pep_map.find(scan_identifier)->second.begin(); pr != pep_map.find(scan_identifier)->second.end(); ++pr) { if (pis == pr->peptide && pr->preAA == hit->getPeptideEvidences().front().getAABefore() && @@ -569,9 +555,16 @@ class TOPPPercolator : //TODO write all percolator parameters as set here in sp it->setSearchParameters(sp); } - + // Storing the PeptideHits with calculated q-value, pep and svm score - MzIdentMLFile().store(getStringOption_("out").toQString().toStdString(), protein_ids_list.front(), peptide_ids_list.front()); + if (!mzid_out.empty()) + { + MzIdentMLFile().store(mzid_out.toQString().toStdString(), protein_ids_list.front(), peptide_ids_list.front()); + } + if (!out.empty()) + { + IdXMLFile().store(out.toQString().toStdString(), protein_ids_list.front(), peptide_ids_list.front()); + } writeLog_("TopPerc finished successfully!"); return EXECUTION_OK; From 1b78721b25ec23f0663e921446dc72adfcf80997 Mon Sep 17 00:00:00 2001 From: Matthew The Date: Wed, 13 Jul 2016 08:58:20 +0200 Subject: [PATCH 21/41] PercolatorAdapter working --- cmake/knime_package_support.cmake | 8 + .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 1 + src/openms/source/ANALYSIS/ID/TopPerc.cpp | 88 ++- .../source/APPLICATIONS/ToolHandler.cpp | 1 + src/topp/PercolatorAdapter.cpp | 660 ++++++++++++++++++ src/topp/executables.cmake | 1 + 6 files changed, 758 insertions(+), 1 deletion(-) create mode 100644 src/topp/PercolatorAdapter.cpp diff --git a/cmake/knime_package_support.cmake b/cmake/knime_package_support.cmake index bddca635496..b9d1e1085e6 100644 --- a/cmake/knime_package_support.cmake +++ b/cmake/knime_package_support.cmake @@ -118,6 +118,8 @@ add_custom_target( COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} 
-DTOOLNAME=MSGFPlusAdapter -DPARAM=executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake # LuciPhorAdapter COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=LuciphorAdapter -DPARAM=executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake + # PercolatorAdapter + COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=PercolatorAdapter -DPARAM=executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake # FidoAdapter COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=FidoAdapter -DPARAM=fido_executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=FidoAdapter -DPARAM=fidocp_executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake @@ -262,9 +264,15 @@ if(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}) elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/OMSSA OR NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/XTandem OR NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/MSGFPlus) message(FATAL_ERROR "The given search engine directory seems to have an invalid layout. ${FOLDER_STRUCTURE_MESSAGE}") elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/Fido) +<<<<<<< HEAD message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (Fido is missing). ${FOLDER_STRUCTURE_MESSAGE}") elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/LuciPHOr2) message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (LuciPHOr2 is missing). ${FOLDER_STRUCTURE_MESSAGE}") +======= + message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (Fido is missing). Please check use the one from the SVN.") +elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/Percolator) + message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (Percolator is missing). 
Please check use the one from the SVN.") +>>>>>>> PercolatorAdapter working elseif(NOT APPLE AND NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/MyriMatch) message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (MyriMatch is missing). ${FOLDER_STRUCTURE_MESSAGE}") endif() diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 1bcd9026407..b427e49ffd3 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -111,6 +111,7 @@ namespace OpenMS public: static bool isEnz(const char& n, const char& c, std::string& enz); + static void preparePin(std::vector& peptide_ids, StringList& feature_set, std::string& enz, TextFile& txt, int min_charge, int max_charge, const char out_sep='\t'); static void prepareCUSTOMpin(std::vector& peptide_ids, TextFile& txt, std::vector& user_param_features, char out_sep='\t'); static void prepareMSGFpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, bool addMHC = false, char out_sep='\t'); static void prepareXTANDEMpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 01b36c5c958..5ce08d5e227 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -42,7 +42,93 @@ namespace OpenMS //TODO for all prepare* PSMId as written in PeptideIdentification::spectrum_reference // and pre/post AA as - if begin/end of protein ([/] in PeptideEvidence) - see prepareMULTIpin //id label scannr feature1 ... featureN peptide proteinId1 .. 
proteinIdM - + + void TopPerc::preparePin(vector& peptide_ids, StringList& feature_set, std::string& enz, TextFile& txt, int min_charge, int max_charge, const char out_sep) + { + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); + Int scan_number = getScanNumber(scan_identifier); + double exp_mass = it->getMZ(); + for (vector::const_iterator jt = it->getHits().begin(); jt != it->getHits().end(); ++jt) + { + PeptideHit hit(*jt); // make a copy of the hit to store temporary features + hit.setMetaValue("SpecId", scan_identifier); + hit.setMetaValue("ScanNr", scan_number); + + int label = 1; + if (hit.metaValueExists("target_decoy") && String(hit.getMetaValue("target_decoy")).hasSubstring("decoy")) + { + label = -1; + } + hit.setMetaValue("Label", label); + + int charge = hit.getCharge(); + String unmodified_sequence = hit.getSequence().toUnmodifiedString(); + + double calc_mass = hit.getSequence().getMonoWeight(Residue::Full, charge)/charge; + hit.setMetaValue("CalcMass", calc_mass); + + + hit.setMetaValue("ExpMass", exp_mass); + hit.setMetaValue("mass", exp_mass); + + double score = hit.getScore(); + hit.setMetaValue("score", score); + + int peptide_length = unmodified_sequence.size(); + hit.setMetaValue("peplen", peptide_length); + + for (int i = min_charge; i <= max_charge; ++i) + { + hit.setMetaValue("charge" + String(i), charge == i); + } + + bool enzN = isEnz(hit.getPeptideEvidences().front().getAABefore(), unmodified_sequence.prefix(1)[0], enz); + hit.setMetaValue("enzN", enzN); + bool enzC = isEnz(unmodified_sequence.suffix(1)[0], hit.getPeptideEvidences().front().getAAAfter(), enz); + hit.setMetaValue("enzC", enzC); + int enzInt = countEnzymatic(unmodified_sequence, enz); + hit.setMetaValue("enzInt", enzInt); + + double delta_mass = exp_mass - calc_mass; + hit.setMetaValue("dm", delta_mass); + + double abs_delta_mass = abs(delta_mass); + 
hit.setMetaValue("absdm", abs_delta_mass); + + //peptide + String sequence = ""; + sequence += String(hit.getPeptideEvidences().front().getAABefore()); // just first peptide evidence + sequence += "." + hit.getSequence().toString() + "."; + sequence += String(hit.getPeptideEvidences().front().getAAAfter()); //just first peptide evidence + hit.setMetaValue("Peptide", sequence); + + //proteinId1 + StringList proteins; + for (vector::const_iterator kt = hit.getPeptideEvidences().begin(); kt != hit.getPeptideEvidences().end(); ++kt) + { + proteins.push_back(kt->getProteinAccession()); + } + hit.setMetaValue("Proteins", ListUtils::concatenate(proteins, out_sep)); + + StringList feats; + for (vector::const_iterator feat = feature_set.begin(); feat != feature_set.end(); ++feat) + { + // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! + if (hit.metaValueExists(*feat)) + { + feats.push_back(hit.getMetaValue(*feat).toString()); + } + } + if (feats.size() == feature_set.size()) + { // only if all feats were present add + txt.addLine(ListUtils::concatenate(feats, out_sep)); + } + } + } + } + void TopPerc::prepareCUSTOMpin(vector& peptide_ids, TextFile& txt, vector& user_param_features, char out_sep) { // Create header for the features diff --git a/src/openms/source/APPLICATIONS/ToolHandler.cpp b/src/openms/source/APPLICATIONS/ToolHandler.cpp index d7c814fd1c6..46d82e6e5f8 100755 --- a/src/openms/source/APPLICATIONS/ToolHandler.cpp +++ b/src/openms/source/APPLICATIONS/ToolHandler.cpp @@ -123,6 +123,7 @@ namespace OpenMS tools_map["PeakPickerWavelet"] = Internal::ToolDescription("PeakPickerWavelet", "Signal processing and preprocessing"); tools_map["PepNovoAdapter"] = Internal::ToolDescription("PepNovoAdapter", "Identification"); tools_map["PeptideIndexer"] = Internal::ToolDescription("PeptideIndexer", "ID Processing"); + tools_map["PercolatorAdapter"] = Internal::ToolDescription("PercolatorAdapter", "ID Processing"); 
tools_map["PhosphoScoring"] = Internal::ToolDescription("PhosphoScoring", "ID Processing"); tools_map["PrecursorIonSelector"] = Internal::ToolDescription("PrecursorIonSelector", "Targeted Experiments"); tools_map["PrecursorMassCorrector"] = Internal::ToolDescription("PrecursorMassCorrector", "Signal processing and preprocessing"); diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp new file mode 100644 index 00000000000..2f0efbc77f0 --- /dev/null +++ b/src/topp/PercolatorAdapter.cpp @@ -0,0 +1,660 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2015. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Mathias Walzer $ +// $Authors: Andreas Simon, Mathias Walzer, Matthew The $ +// -------------------------------------------------------------------------- +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace OpenMS; +using namespace std; + +//------------------------------------------------------------- +//Doxygen docu +//------------------------------------------------------------- + +/** + @page TOPP_PercolatorAdapter PercolatorAdapter + + @brief PercolatorAdapter facilitates the input to, the call of and output integration of Percolator. + Percolator (http://per-colator.com/) is a tool to apply semi-supervised learning for peptide + identification from shotgun proteomics datasets. + + @experimental This tool is work in progress and usage and input requirements might change. + +
+ + + + + + + + + + +
potential predecessor tools \f$ \longrightarrow \f$ MSGF+\f$ \longrightarrow \f$ potential successor tools
@ref TOPP_IDFilter @ref TOPP_IDMapper
+
+ +

Percolator is search engine sensitive, i.e. its input features vary depending on the search engine.

+ + The command line parameters of this tool are: + @verbinclude TOPP_PercolatorAdapter.cli + INI file documentation of this tool: + @htmlinclude TOPP_PercolatorAdapter.html + + Percolator is written by Lukas Käll (http://per-colator.com/ Copyright Lukas Käll ) +*/ + +// We do not want this class to show up in the docu: +/// @cond TOPPCLASSES + + +class PercolatorAdapter : + public TOPPBase +{ +public: + PercolatorAdapter() : + TOPPBase("PercolatorAdapter", "Facilitate input to Percolator and reintegrate.", false) + { + } + +protected: + struct PercolatorResult + { + String PSMId; + double score; + double qvalue; + double posterior_error_prob; + String peptide; + char preAA; + char postAA; + StringList proteinIds; + + PercolatorResult(const String& pid, const double s, const double q, const String& p, const char pre, const char pos, const StringList& pl): + PSMId (pid), + score (s), + qvalue (q), + peptide (p), + preAA (pre), + postAA (pos), + proteinIds (pl) + { + } + + PercolatorResult(StringList& row): + proteinIds() + { + // peptide sequence + StringList pep; + row[4].split(".", pep); + //TODO test pep size 3 + peptide = pep[1]; + preAA = pep[0]=="-"?'[':pep[0].c_str()[0]; // const char PeptideEvidence::N_TERMINAL_AA = '['; + postAA = pep[2]=="-"?']':pep[2].c_str()[0]; // const char PeptideEvidence::C_TERMINAL_AA = ']'; + // SVM-score + score = row[1].toDouble(); + // q-Value + qvalue = row[2].toDouble(); + // PEP + posterior_error_prob = row[3].toDouble(); + // scannr. 
as written in preparePIN + PSMId = row[0]; + proteinIds = vector(row.begin()+5,row.end()); + } + + bool operator!=(const PercolatorResult& rhs) const + { + if (PSMId != rhs.PSMId || score != rhs.score || qvalue != rhs.qvalue || + posterior_error_prob != rhs.posterior_error_prob || peptide != rhs.peptide || + proteinIds != rhs.proteinIds) + return true; + return false; + } + + bool operator==(const PercolatorResult& rhs) const + { + return !(operator !=(rhs)); + } + }; + + void registerOptionsAndFlags_() + { + registerInputFileList_("in", "", StringList(), "Input file(s)", true); + setValidFormats_("in", ListUtils::create("mzid,idXML")); + registerInputFileList_("in_decoy", "", StringList(), "Input decoy file(s) in case of separate searches", false); + setValidFormats_("in_decoy", ListUtils::create("mzid,idXML")); + registerOutputFile_("out", "", "", "Output file in idXML format", false); + registerOutputFile_("mzid_out", "", "", "Output file in mzid format", false); + String enzs = "no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin"; + registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: "+enzs , false); + setValidStrings_("enzyme", ListUtils::create(enzs)); + registerInputFile_("percolator_executable", "", + // choose the default value according to the platform where it will be executed + #ifdef OPENMS_WINDOWSPLATFORM + "percolator.exe", + #else + "percolator", + #endif + "Percolator executable of the installation e.g. 'percolator.exe'", true, false, ListUtils::create("skipexists") + ); + + //Advanced parameters + registerDoubleOption_("cpos", "", 0.0, "Cpos, penalty for mistakes made on positive examples. Set by cross validation if not specified.", false, true); + registerDoubleOption_("cneg", "", 0.0, "Cneg, penalty for mistakes made on negative examples. 
Set by cross validation if not specified.", false, true); + registerDoubleOption_("testFDR", "", 0.01, "False discovery rate threshold for evaluating best cross validation result and the reported end result.", false, true); + registerDoubleOption_("trainFDR", "", 0.01, "False discovery rate threshold to define positive examples in training. Set to testFDR if 0.", false, true); + registerIntOption_("maxiter", "", 10, "Maximal number of iterations", false, true); + registerFlag_("quick-validation", "Quicker execution by reduced internal cross-validation.", true); + registerOutputFile_("weights", "", "", "Output final weights to the given file", false, true); + registerInputFile_("init-weights", "", "", "Read initial weights to the given file", false, true); + registerStringOption_("default-direction", "", "", "The most informative feature given as the feature name, can be negated to indicate that a lower value is better.", false, true); + registerIntOption_("verbose", "", 2, "Set verbosity of output: 0=no processing info, 5=all.", false, true); + registerFlag_("unitnorm", "Use unit normalization [0-1] instead of standard deviation normalization", true); + registerFlag_("test-each-iteration", "Measure performance on test set each iteration", true); + registerFlag_("override", "Override error check and do not fall back on default score vector in case of suspect score vector", true); + registerIntOption_("seed", "", 1, "Setting seed of the random number generator.", false, true); + registerIntOption_("doc", "", 0, "Include description of correct features", false, true); + registerFlag_("klammer", "Retention time features calculated as in Klammer et al. 
Only available if -doc is set", true); + registerFlag_("picked-protein", "Use the picked protein-level FDR to infer protein probabilities.", true); + registerInputFile_("fasta", "", "", "Provide the fasta file as the argument to this flag, which will be used for protein grouping based on an in-silico digest (only valid if option -picked-protein is active).", false, true); + setValidFormats_("fasta", ListUtils::create("FASTA")); + registerStringOption_("decoy-pattern", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (Only valid if option -picked-protein is active).", false, true); + registerFlag_("post-processing-tdc", "Use target-decoy competition to assign q-values and PEPs.", true); + } + + String getScanIdentifier_(vector::iterator it, vector::iterator start) + { + String scan_identifier = it->getMetaValue("spectrum_reference"); + if (scan_identifier.empty()) + { + scan_identifier = String(it->getMetaValue("spectrum_id")); + if (scan_identifier.empty()) + { + scan_identifier = String(it - start + 1); + LOG_WARN << "no known spectrum identifiers, using index [1,n] - use at own risk." 
<< endl; + } + } + return scan_identifier.removeWhitespaces(); + } + + Int getScanNumber_(String scan_identifier) + { + Size idx = 0; + if ((idx = scan_identifier.find("index=")) != string::npos) + { + scan_identifier = scan_identifier.substr(idx + 6); + } + else if ((idx = scan_identifier.find("scan=")) != string::npos) + { + scan_identifier = scan_identifier.substr(idx + 5); + } + return scan_identifier.toInt(); + } + + void readPoutAsMap_(String pout_file, Map >& pep_map) + { + CsvFile csv_file(pout_file, '\t'); + StringList row; + + for (Size i = 1; i < csv_file.rowCount(); ++i) + { + csv_file.getRow(i, row); + PercolatorResult res(row); + String spec_ref = res.PSMId; + if (pep_map.find(spec_ref) == pep_map.end()) + { + pep_map[spec_ref] = vector(); + } + pep_map[spec_ref].push_back(res); + } + } + + ExitCodes main_(int, const char**) + { + //------------------------------------------------------------- + // general variables and data to perform PercolatorAdapter + //------------------------------------------------------------- + vector peptide_ids; + vector protein_ids; + + //------------------------------------------------------------- + // parsing parameters + //------------------------------------------------------------- + const StringList in_list = getStringList_("in"); + const StringList in_decoy = getStringList_("in_decoy"); + LOG_DEBUG << "Input file (of target?): " << ListUtils::concatenate(in_list, ",") << " & " << ListUtils::concatenate(in_decoy, ",") << " (decoy)" << endl; + + const String percolator_executable(getStringOption_("percolator_executable")); + writeDebug_(String("Path to the percolator: ") + percolator_executable, 2); + if (percolator_executable.empty()) //TODO? - TOPPBase::findExecutable after registerInputFile_("percolator_executable"... ??? + { + writeLog_("No percolator executable specified. 
Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + const String mzid_out(getStringOption_("mzid_out")); + const String out(getStringOption_("out")); + if (mzid_out.empty() && out.empty()) + { + writeLog_("Fatal error: no output file given (parameter 'out' or 'mzid_out')"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + //------------------------------------------------------------- + // read input + //------------------------------------------------------------- + vector > peptide_ids_list; + vector > protein_ids_list; + for (size_t i = 0; i < in_list.size(); ++i) + { + String in = in_list[i]; + FileHandler fh; + FileTypes::Type in_type = fh.getType(in); + if (in_type == FileTypes::IDXML) + { + IdXMLFile().load(in, protein_ids, peptide_ids); + } + else if (in_type == FileTypes::MZIDENTML) + { + LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; + MzIdentMLFile().load(in, protein_ids, peptide_ids); + } + //else catched by TOPPBase:registerInput being mandatory mzid or idxml + + if (peptide_ids.empty()) + { + writeLog_("No or empty input file specified. Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process + for (vector::iterator pit = peptide_ids.begin(); pit != peptide_ids.end(); ++pit) + { + for (vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) + { + // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! + if (!pht->metaValueExists("target_decoy")) + { + if (!in_decoy.empty()) + { + pht->setMetaValue("target_decoy", "target"); + } + else + { + writeLog_("No target decoy search results discrimination possible. 
Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + } + } + } + peptide_ids_list.push_back(peptide_ids); + protein_ids_list.push_back(protein_ids); + } + + //------------------------------------------------------------- + // read more input if necessary + //------------------------------------------------------------- + if (!in_decoy.empty() && in_list.size() == 1) + { + vector decoy_peptide_ids; + vector decoy_protein_ids; + FileHandler fh; + FileTypes::Type in_decoy_type = fh.getType(in_decoy.front()); + if (in_decoy_type == FileTypes::IDXML) + { + IdXMLFile().load(in_decoy.front(), decoy_protein_ids, decoy_peptide_ids); + } + else if (in_decoy_type == FileTypes::MZIDENTML) + { + LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; + MzIdentMLFile().load(in_decoy.front(), decoy_protein_ids, decoy_peptide_ids); + } + + //paranoia check if this comes from the same search engine! (only in the first proteinidentification of the first proteinidentifications vector vector) + { + if (decoy_protein_ids.front().getSearchEngine() != protein_ids_list.front().front().getSearchEngine() ) + { + LOG_WARN << "Warning about differing SearchEngine between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getScoreType() != protein_ids_list.front().front().getScoreType() ) + { + LOG_WARN << "Warning about differing ScoreType between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getPrimaryMSRunPath() != protein_ids_list.front().front().getPrimaryMSRunPath() ) + { + LOG_WARN << "Warning about differing SearchInput between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().digestion_enzyme != protein_ids_list.front().front().getSearchParameters().digestion_enzyme ) + { + LOG_WARN << "Warning about differing DigestionEnzyme between target and decoy run" << endl; + } + if 
(decoy_protein_ids.front().getSearchParameters().variable_modifications != protein_ids_list.front().front().getSearchParameters().variable_modifications ) + { + LOG_WARN << "Warning about differing VarMods between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().fixed_modifications != protein_ids_list.front().front().getSearchParameters().fixed_modifications ) + { + LOG_WARN << "Warning about differing FixMods between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().charges != protein_ids_list.front().front().getSearchParameters().charges ) + { + LOG_WARN << "Warning about differing SearchCharges between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().fragment_mass_tolerance != protein_ids_list.front().front().getSearchParameters().fragment_mass_tolerance ) + { + LOG_WARN << "Warning about differing FragTol between target and decoy run" << endl; + } + if (decoy_protein_ids.front().getSearchParameters().precursor_tolerance != protein_ids_list.front().front().getSearchParameters().precursor_tolerance ) + { + LOG_WARN << "Warning about differing PrecTol between target and decoy run" << endl; + } + } + + //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process + for (vector::iterator pit = decoy_peptide_ids.begin(); pit != decoy_peptide_ids.end(); ++pit) + { + for (vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) + { + pht->setMetaValue("target_decoy", "decoy"); + //TODO what about proteins - internal target decoy handling is shitty - rework pls + } + } + //TODO check overlap of ids in terms of spectrum id/reference + peptide_ids_list.front().insert( peptide_ids.end(), decoy_peptide_ids.begin(), decoy_peptide_ids.end() ); + protein_ids_list.front().insert( protein_ids.end(), decoy_protein_ids.begin(), decoy_protein_ids.end() ); + writeLog_("Using decoy hits from 
separate file."); + } + else + { + writeLog_("Using decoy hits from input id file. You did you use a target decoy search, did you?"); + } + + + //------------------------------------------------------------- + // extract search engine and prepare pin + //------------------------------------------------------------- + String se = protein_ids_list.front().front().getSearchEngine(); + for (vector >::iterator pilit = protein_ids_list.begin(); pilit != protein_ids_list.end(); ++pilit) + { + if (se != pilit->front().getSearchEngine()) + { + se = "multiple"; + break; + } + } + LOG_DEBUG << "Registered search engine: " << se << endl; + TextFile txt; + + //TODO introduce min/max charge to parameters for now take available range + int max_charge = 0; + int min_charge = 10; + for (vector >::iterator pilit = peptide_ids_list.begin(); pilit != peptide_ids_list.end(); ++pilit) + { + for (vector::iterator it = pilit->begin(); it != pilit->end(); ++it) + { + for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + if (hit->getCharge() > max_charge) + { + max_charge = hit->getCharge(); + } + if (hit->getCharge() < min_charge) + { + min_charge = hit->getCharge(); + } + } + } + } + LOG_DEBUG << "Using min/max charges of " << min_charge << "/" << max_charge << endl; + + + StringList feature_set; + feature_set.push_back("SpecId"); + feature_set.push_back("Label"); + feature_set.push_back("ScanNr"); + feature_set.push_back("ExpMass"); + feature_set.push_back("CalcMass"); + feature_set.push_back("mass"); + feature_set.push_back("score"); + feature_set.push_back("peplen"); + for (int i = min_charge; i <= max_charge; ++i) + { + feature_set.push_back("charge" + String(i)); + } + feature_set.push_back("enzN"); + feature_set.push_back("enzC"); + feature_set.push_back("enzInt"); + feature_set.push_back("dm"); + feature_set.push_back("absdm"); + feature_set.push_back("Peptide"); + feature_set.push_back("Proteins"); + + string enz_str = 
getStringOption_("enzyme"); + txt.addLine(ListUtils::concatenate(feature_set, '\t')); + TopPerc::preparePin(peptide_ids_list.front(), feature_set, enz_str, txt, min_charge, max_charge); + + // create temp directory to store percolator in file pin.tab temporarily + String temp_directory_body = QDir::toNativeSeparators((File::getTempDirectory() + "/" + File::getUniqueName() + "/").toQString()); // body for the tmp files + { + QDir d; + d.mkpath(temp_directory_body.toQString()); + } + String txt_designator = File::getUniqueName(); + String pin_file(temp_directory_body + txt_designator + "_pin.tab"); + String pout_target_file(temp_directory_body + txt_designator + "_target_pout.tab"); + String pout_decoy_file(temp_directory_body + txt_designator + "_decoy_pout.tab"); + txt.store(pin_file); + + QStringList arguments; + // Check all set parameters and get them into arguments StringList + { + arguments << "-U"; + arguments << "-m" << pout_target_file.toQString(); + arguments << "-M" << pout_decoy_file.toQString(); + + double cpos = getDoubleOption_("cpos"); + double cneg = getDoubleOption_("cneg"); + if (cpos != 0.0) arguments << "-p" << String(cpos).toQString(); + if (cneg != 0.0) arguments << "-n" << String(cneg).toQString(); + + double train_FDR = getDoubleOption_("trainFDR"); + double test_FDR = getDoubleOption_("testFDR"); + if (train_FDR != 0.01) arguments << "-F" << String(train_FDR).toQString(); + if (test_FDR != 0.01) arguments << "-t" << String(test_FDR).toQString(); + + Int max_iter = getIntOption_("maxiter"); + if (max_iter != 10) arguments << "-i" << String(max_iter).toQString(); + if (getFlag_("quick-validation")) arguments << "-x"; + if (getFlag_("post-processing-tdc")) arguments << "-Y"; + + String weights_file = getStringOption_("weights"); + String init_weights_file = getStringOption_("init-weights"); + String default_search_direction = getStringOption_("default-direction"); + if (!weights_file.empty()) arguments << "-w" << weights_file.toQString(); + if 
(!init_weights_file.empty()) arguments << "-W" << init_weights_file.toQString(); + if (!default_search_direction.empty()) arguments << "-V" << default_search_direction.toQString(); + + Int verbose_level = getIntOption_("verbose"); + if (verbose_level != 2) arguments << "-v" << String(verbose_level).toQString(); + if (getFlag_("unitnorm")) arguments << "-u"; + if (getFlag_("test-each-iteration")) arguments << "-R"; + if (getFlag_("override")) arguments << "-O"; + + Int seed = getIntOption_("seed"); + if (seed != 1) arguments << "-S" << String(seed).toQString(); + if (getFlag_("klammer")) arguments << "-K"; + + Int description_of_correct = getIntOption_("doc"); + if (description_of_correct != 0) arguments << "-D" << String(description_of_correct).toQString(); + + String decoy_pattern = getStringOption_("decoy-pattern"); + if (decoy_pattern != "random") arguments << "-P" << decoy_pattern.toQString(); + arguments << pin_file.toQString(); + } + writeLog_("Prepared percolator input."); + + //------------------------------------------------------------- + // run percolator + //------------------------------------------------------------- + // Percolator execution with the executable ant the arguments StringList + int status = QProcess::execute(percolator_executable.toQString(), arguments); // does automatic escaping etc... + if (status != 0) + { + writeLog_("Percolator problem. Aborting! Calling command was: '" + percolator_executable + " \"" + arguments.join("-").toStdString() + "\"."); + // clean temporary files + if (this->debug_level_ < 2) + { + File::removeDirRecursively(temp_directory_body); + LOG_WARN << "Set debug level to >=2 to keep the temporary files at '" << temp_directory_body << "'" << endl; + } + else + { + LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <2 to remove them." 
<< endl; + } + return EXTERNAL_PROGRAM_ERROR; + } + writeLog_("Executed percolator!"); + + + //------------------------------------------------------------- + // reintegrate pout results + //------------------------------------------------------------- + // when percolator finished calculation, it stores the results -r option (with or without -U) or -m (which seems to be not working) + // WARNING: The -r option cannot be used in conjunction with -U: no peptide level statistics are calculated, redirecting PSM level statistics to provided file instead. + Map > pep_map; + readPoutAsMap_(pout_target_file, pep_map); + readPoutAsMap_(pout_decoy_file, pep_map); + + // As the percolator output file is not needed anymore, the temporary directory is going to be deleted + if (this->debug_level_ < 99) + { + File::removeDirRecursively(temp_directory_body); + } + else + { + LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <99 to remove them." << endl; + } + + // Add the percolator results to the peptide vector of the original input file + size_t c_debug = 0; + size_t cnt = 0; + for (vector::iterator it = peptide_ids_list.front().begin(); it != peptide_ids_list.front().end(); ++it) + { + String scan_identifier = getScanIdentifier_(it, peptide_ids_list.front().begin()); + if (pep_map.find(scan_identifier) == pep_map.end()) + { + ++c_debug; + LOG_DEBUG << "No suitable PeptideIdentification entry found for .pout entry " << scan_identifier << endl; + continue; + } + + //check each PeptideHit for compliance with one of the PercolatorResults (by sequence) + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + String pis = hit->getSequence().toString(); + for (vector::iterator pr = pep_map.find(scan_identifier)->second.begin(); pr != pep_map.find(scan_identifier)->second.end(); ++pr) + { + if (pis == pr->peptide && + pr->preAA == hit->getPeptideEvidences().front().getAABefore() && + pr->postAA == 
hit->getPeptideEvidences().front().getAAAfter()) + { + hit->setMetaValue("MS:1001492", pr->score); // svm score + hit->setMetaValue("MS:1001491", pr->qvalue); // percolator q value + hit->setMetaValue("MS:1001493", pr->posterior_error_prob); // percolator pep + ++cnt; + } + } + } + } + LOG_INFO << "No suitable PeptideIdentification for " << c_debug << " out of " << peptide_ids_list.front().size() << endl; + LOG_INFO << "Suitable PeptideHits for " << cnt << " found." << endl; + + for (vector::iterator it = protein_ids_list.front().begin(); it != protein_ids_list.front().end(); ++it) + { + //will not be set because ALL decoy hits got no new score + //it->setSearchEngine("Percolator"); + //it->setScoreType("q-value"); + //it->setHigherScoreBetter(false); + + //TODO add software percolator and PercolatorAdapter + it->setMetaValue("percolator", "PercolatorAdapter"); + ProteinIdentification::SearchParameters sp = it->getSearchParameters(); + //TODO write all percolator parameters as set here in sp + it->setSearchParameters(sp); + } + + // Storing the PeptideHits with calculated q-value, pep and svm score + if (!mzid_out.empty()) + { + MzIdentMLFile().store(mzid_out.toQString().toStdString(), protein_ids_list.front(), peptide_ids_list.front()); + } + if (!out.empty()) + { + IdXMLFile().store(out.toQString().toStdString(), protein_ids_list.front(), peptide_ids_list.front()); + } + + writeLog_("PercolatorAdapter finished successfully!"); + return EXECUTION_OK; + } + +}; + + +int main(int argc, const char** argv) +{ + PercolatorAdapter tool; + + return tool.main(argc, argv); +} + +/// @endcond diff --git a/src/topp/executables.cmake b/src/topp/executables.cmake index 46bd1a3ef9b..c16006716d8 100644 --- a/src/topp/executables.cmake +++ b/src/topp/executables.cmake @@ -72,6 +72,7 @@ PeakPickerHiRes PeakPickerWavelet PepNovoAdapter PeptideIndexer +PercolatorAdapter PhosphoScoring PrecursorIonSelector PrecursorMassCorrector From be54960f42324f5cee5083cf600fce28d38bc946 Mon Sep 
17 00:00:00 2001 From: Matthew The Date: Thu, 14 Jul 2016 02:01:52 +0200 Subject: [PATCH 22/41] Created PSMFeatureExtractor util --- cmake/knime_package_support.cmake | 2 +- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 97 +- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 1622 ++++------------- .../source/APPLICATIONS/ToolHandler.cpp | 2 +- src/topp/PercolatorAdapter.cpp | 549 ++++-- src/utils/PSMFeatureExtractor.cpp | 289 +++ src/utils/TopPerc.cpp | 583 ------ src/utils/executables.cmake | 2 +- 8 files changed, 1041 insertions(+), 2105 deletions(-) create mode 100644 src/utils/PSMFeatureExtractor.cpp delete mode 100644 src/utils/TopPerc.cpp diff --git a/cmake/knime_package_support.cmake b/cmake/knime_package_support.cmake index b9d1e1085e6..161b73615fc 100644 --- a/cmake/knime_package_support.cmake +++ b/cmake/knime_package_support.cmake @@ -119,7 +119,7 @@ add_custom_target( # LuciPhorAdapter COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=LuciphorAdapter -DPARAM=executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake # PercolatorAdapter - COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=PercolatorAdapter -DPARAM=executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake + COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=PercolatorAdapter -DPARAM=percolator_executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake # FidoAdapter COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=FidoAdapter -DPARAM=fido_executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake COMMAND ${CMAKE_COMMAND} -D SCRIPT_DIR=${SCRIPT_DIRECTORY} -DTOOLNAME=FidoAdapter -DPARAM=fidocp_executable -D CTD_PATH=${CTD_PATH} -P ${SCRIPT_DIRECTORY}remove_parameter_from_ctd.cmake diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 
b427e49ffd3..bf413c311b0 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -29,7 +29,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Mathias Walzer $ -// $Authors: Mathias Walzer $ +// $Authors: Mathias Walzer, Matthew The $ // -------------------------------------------------------------------------- #ifndef OPENMS_ANALYSIS_ID_TOPPERC_H @@ -44,89 +44,23 @@ #include #include #include -#include namespace OpenMS { class OPENMS_DLLAPI TopPerc { - public: - struct PercolatorResult - { - String PSMId; - double score; - double qvalue; - double posterior_error_prob; - String peptide; - char preAA; - char postAA; - StringList proteinIds; - - PercolatorResult(const String& pid, const double s, const double q, const String& p, const char pre, const char pos, const StringList& pl): - PSMId (pid), - score (s), - qvalue (q), - peptide (p), - preAA (pre), - postAA (pos), - proteinIds (pl) - { - } - - PercolatorResult(StringList& row): - proteinIds() - { - // peptide sequence - StringList pep; - row[4].split(".", pep); - //TODO test pep size 3 - peptide = pep[1]; - preAA = pep[0]=="-"?'[':pep[0].c_str()[0]; // const char PeptideEvidence::N_TERMINAL_AA = '['; - postAA = pep[2]=="-"?']':pep[2].c_str()[0]; // const char PeptideEvidence::C_TERMINAL_AA = ']'; - // SVM-score - score = row[1].toDouble(); - // q-Value - qvalue = row[2].toDouble(); - // PEP - posterior_error_prob = row[3].toDouble(); - // scannr. 
as written in preparePIN - PSMId = row[0]; - proteinIds = std::vector(row.begin()+5,row.end()); - } - - bool operator!=(const TopPerc::PercolatorResult& rhs) const - { - if (PSMId != rhs.PSMId || score != rhs.score || qvalue != rhs.qvalue || - posterior_error_prob != rhs.posterior_error_prob || peptide != rhs.peptide || - proteinIds != rhs.proteinIds) - return true; - return false; - } - - bool operator==(const TopPerc::PercolatorResult& rhs) const - { - return !(operator !=(rhs)); - } - }; public: - static bool isEnz(const char& n, const char& c, std::string& enz); - static void preparePin(std::vector& peptide_ids, StringList& feature_set, std::string& enz, TextFile& txt, int min_charge, int max_charge, const char out_sep='\t'); - static void prepareCUSTOMpin(std::vector& peptide_ids, TextFile& txt, std::vector& user_param_features, char out_sep='\t'); - static void prepareMSGFpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, bool addMHC = false, char out_sep='\t'); - static void prepareXTANDEMpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); - static void prepareCOMETpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); - static void prepareMASCOTpin(std::vector& peptide_ids, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); - static void prepareMULTIpin(std::vector& peptide_ids, ProteinIdentification& protein_id, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); - static void prepareCONCATpin(std::vector >& peptide_id_list, std::vector >& protein_id_list, std::string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep='\t'); - static void readPoutAsMap(String pout_file, std::map >& pep_map); - static Size countEnzymatic(String peptide, std::string& enz); - static double rescaleFragmentFeature(double 
featureValue, int NumMatchedMainIons); - static String getScanIdentifier(std::vector::iterator it, std::vector::iterator start); - static Int getScanNumber(String scan_identifier); - static void assignDeltaScore(std::vector& hits, String score_ref); - static void mergeMULTIids(std::vector >& protein_ids_list, std::vector >& peptide_ids_list, bool skip_checks=false); - + static void mergeMULTISEids(std::vector& all_protein_ids, std::vector& all_peptide_ids, std::vector& new_protein_ids, std::vector& new_peptide_ids, StringList& search_engines_used); + static void concatMULTISEids(std::vector& all_protein_ids, std::vector& all_peptide_ids, std::vector& new_protein_ids, std::vector& new_peptide_ids, StringList& search_engines_used); + + static void addMSGFFeatures(std::vector& peptide_ids, StringList& feature_set); + static void addXTANDEMFeatures(std::vector& peptide_ids, StringList& feature_set); + static void addCOMETFeatures(std::vector& peptide_ids, StringList& feature_set); + static void addMASCOTFeatures(std::vector& peptide_ids, StringList& feature_set); + static void addMULTISEFeatures(std::vector& peptide_ids, StringList& search_engines_used, StringList& feature_set); + static void addCONCATSEFeatures(std::vector& peptide_id_list, StringList& search_engines_used, StringList& feature_set); + struct lq_ProteinHit { inline bool operator() (const ProteinHit& h1, const ProteinHit& h2) @@ -142,12 +76,15 @@ namespace OpenMS return (h1.getProteinAccession() < h2.getProteinAccession()); } }; + - private: + protected: TopPerc(); virtual ~TopPerc(); - - + + static double rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons); + static void assignDeltaScore_(std::vector& hits, String score_ref, String output_ref); + static bool hasMHCEnd_(String peptide); }; diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 5ce08d5e227..58038fc812d 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ 
b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -29,7 +29,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Mathias Walzer $ -// $Authors: Mathias Walzer $ +// $Authors: Mathias Walzer, Matthew The $ // -------------------------------------------------------------------------- #include @@ -39,97 +39,8 @@ using namespace std; namespace OpenMS { - //TODO for all prepare* PSMId as written in PeptideIdentification::spectrum_reference - // and pre/post AA as - if begin/end of protein ([/] in PeptideEvidence) - see prepareMULTIpin - //id label scannr feature1 ... featureN peptide proteinId1 .. proteinIdM - - void TopPerc::preparePin(vector& peptide_ids, StringList& feature_set, std::string& enz, TextFile& txt, int min_charge, int max_charge, const char out_sep) - { - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); - Int scan_number = getScanNumber(scan_identifier); - double exp_mass = it->getMZ(); - for (vector::const_iterator jt = it->getHits().begin(); jt != it->getHits().end(); ++jt) - { - PeptideHit hit(*jt); // make a copy of the hit to store temporary features - hit.setMetaValue("SpecId", scan_identifier); - hit.setMetaValue("ScanNr", scan_number); - - int label = 1; - if (hit.metaValueExists("target_decoy") && String(hit.getMetaValue("target_decoy")).hasSubstring("decoy")) - { - label = -1; - } - hit.setMetaValue("Label", label); - - int charge = hit.getCharge(); - String unmodified_sequence = hit.getSequence().toUnmodifiedString(); - - double calc_mass = hit.getSequence().getMonoWeight(Residue::Full, charge)/charge; - hit.setMetaValue("CalcMass", calc_mass); - - - hit.setMetaValue("ExpMass", exp_mass); - hit.setMetaValue("mass", exp_mass); - - double score = hit.getScore(); - hit.setMetaValue("score", score); - - int peptide_length = unmodified_sequence.size(); - hit.setMetaValue("peplen", peptide_length); - 
- for (int i = min_charge; i <= max_charge; ++i) - { - hit.setMetaValue("charge" + String(i), charge == i); - } - - bool enzN = isEnz(hit.getPeptideEvidences().front().getAABefore(), unmodified_sequence.prefix(1)[0], enz); - hit.setMetaValue("enzN", enzN); - bool enzC = isEnz(unmodified_sequence.suffix(1)[0], hit.getPeptideEvidences().front().getAAAfter(), enz); - hit.setMetaValue("enzC", enzC); - int enzInt = countEnzymatic(unmodified_sequence, enz); - hit.setMetaValue("enzInt", enzInt); - - double delta_mass = exp_mass - calc_mass; - hit.setMetaValue("dm", delta_mass); - - double abs_delta_mass = abs(delta_mass); - hit.setMetaValue("absdm", abs_delta_mass); - - //peptide - String sequence = ""; - sequence += String(hit.getPeptideEvidences().front().getAABefore()); // just first peptide evidence - sequence += "." + hit.getSequence().toString() + "."; - sequence += String(hit.getPeptideEvidences().front().getAAAfter()); //just first peptide evidence - hit.setMetaValue("Peptide", sequence); - - //proteinId1 - StringList proteins; - for (vector::const_iterator kt = hit.getPeptideEvidences().begin(); kt != hit.getPeptideEvidences().end(); ++kt) - { - proteins.push_back(kt->getProteinAccession()); - } - hit.setMetaValue("Proteins", ListUtils::concatenate(proteins, out_sep)); - - StringList feats; - for (vector::const_iterator feat = feature_set.begin(); feat != feature_set.end(); ++feat) - { - // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! 
- if (hit.metaValueExists(*feat)) - { - feats.push_back(hit.getMetaValue(*feat).toString()); - } - } - if (feats.size() == feature_set.size()) - { // only if all feats were present add - txt.addLine(ListUtils::concatenate(feats, out_sep)); - } - } - } - } - - void TopPerc::prepareCUSTOMpin(vector& peptide_ids, TextFile& txt, vector& user_param_features, char out_sep) + /* + void TopPerc::prepareCUSTOMpin(vector& peptide_ids, vector& user_param_features) { // Create header for the features string min_featureset = "SpecId, Label, ScanNr"; @@ -169,41 +80,27 @@ namespace OpenMS } } } - - void TopPerc::prepareMSGFpin(vector& peptide_ids, string& enz, TextFile& txt, int min_charge, int max_charge, bool addMHC, char out_sep) + */ + + void TopPerc::addMSGFFeatures(vector& peptide_ids, StringList& feature_set) { - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << min_charge << ", "; - for (int j = min_charge + 1; j < max_charge + 1; j++) - { - ss << "Charge" << j << ","; - } - - // Create header for the features - string featureset = "SpecId, Label,ScanNr, RawScore, DeNovoScore,ScoreRatio, Energy,lnEValue,IsotopeError, lnExplainedIonCurrentRatio,lnNTermIonCurrentRatio,lnCTermIonCurrentRatio,lnMS2IonCurrent,Mass,PepLen,dM,absdM,MeanErrorTop7,sqMeanErrorTop7,StdevErrorTop7," + ss.str() ; - StringList txt_header = ListUtils::create(featureset); - if (addMHC) - { - txt_header.push_back("enzN"); - txt_header.push_back("enzC"); - txt_header.push_back("MHCLct"); - txt_header.push_back("Peptide"); - txt_header.push_back("Protein"); - } - else - { - txt_header.push_back("enzN"); - txt_header.push_back("enzC"); - txt_header.push_back("enzInt"); - txt_header.push_back("Peptide"); - txt_header.push_back("Protein"); - } - txt.addLine(ListUtils::concatenate(txt_header, out_sep)); - + feature_set.push_back("MS:1002049"); // unchanged RawScore + feature_set.push_back("MS:1002050"); // unchanged DeNovoScore + 
feature_set.push_back("MSGF:ScoreRatio"); + feature_set.push_back("MSGF:Energy"); + feature_set.push_back("MSGF:lnEValue"); + feature_set.push_back("IsotopeError"); // unchanged IsotopeError + feature_set.push_back("MSGF:lnExplainedIonCurrentRatio"); + feature_set.push_back("MSGF:lnNTermIonCurrentRatio"); + feature_set.push_back("MSGF:lnCTermIonCurrentRatio"); + feature_set.push_back("MSGF:lnMS2IonCurrent"); + feature_set.push_back("MSGF:MeanErrorTop7"); + feature_set.push_back("MSGF:sqMeanErrorTop7"); + feature_set.push_back("MSGF:StdevErrorTop7"); + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! if (hit->metaValueExists("NumMatchedMainIons")) @@ -211,835 +108,265 @@ namespace OpenMS // only take features from first ranked entries and only with meanerrortop7 != 0.0 if (hit->getMetaValue("MeanErrorTop7").toString().toDouble() != 0.0) { - int charge = hit->getCharge(); - - String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); - Int scan_number = getScanNumber(scan_identifier); - int label = 1; - if ((String(hit->getMetaValue("target_decoy"))).hasSubstring("decoy")) + double raw_score = hit->getMetaValue("MS:1002049").toString().toDouble(); + double denovo_score = hit->getMetaValue("MS:1002050").toString().toDouble(); + + double energy = denovo_score - raw_score; + double score_ratio = raw_score * 10000; + if (denovo_score > 0) { - label = -1; + score_ratio = (raw_score / denovo_score); } - - double rawScore = hit->getMetaValue("MS:1002049").toString().toDouble(); - double denovoScore = hit->getMetaValue("MS:1002050").toString().toDouble(); - - double scoreRatio; - if (denovoScore > 0) - { - scoreRatio = (rawScore / denovoScore); - } - else - { - 
scoreRatio = rawScore * 10000; - } - - double energy = denovoScore - rawScore; + hit->setMetaValue("MSGF:ScoreRatio", score_ratio); + hit->setMetaValue("MSGF:Energy", energy); + double ln_eval = -log(hit->getMetaValue("MS:1002053").toString().toDouble()); - int isotopeError = hit->getMetaValue("IsotopeError").toString().toInt(); - double lnExplainedIonCurrentRatio = log(hit->getMetaValue("ExplainedIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! - double lnNTermIonCurrentRatio = log(hit->getMetaValue("NTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! - double lnCTermIonCurrentRatio = log(hit->getMetaValue("CTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! - double lnMS2IonCurrent = log(hit->getMetaValue("MS2IonCurrent").toString().toDouble()); - double expMass = it->getMZ(); - double calcMass = hit->getMetaValue("calcMZ"); - int pepLen = hit->getSequence().toUnmodifiedString().length(); - double dM = (expMass - (isotopeError * Constants::NEUTRON_MASS_U / charge) - calcMass) / expMass; - double absdM = abs(dM); - double meanErrorTop7 = hit->getMetaValue("MeanErrorTop7").toString().toDouble(); - int NumMatchedMainIons = hit->getMetaValue("NumMatchedMainIons").toString().toInt(); - - double stdevErrorTop7 = 0.0; + hit->setMetaValue("MSGF:lnEValue", ln_eval); + + double ln_explained_ion_current_ratio = log(hit->getMetaValue("ExplainedIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! + double ln_NTerm_ion_current_ratio = log(hit->getMetaValue("NTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! + double ln_CTerm_ion_current_ratio = log(hit->getMetaValue("CTermIonCurrentRatio").toString().toDouble() + 0.0001); // @andsi: wtf?! 
+ hit->setMetaValue("MSGF:lnExplainedIonCurrentRatio", ln_explained_ion_current_ratio); + hit->setMetaValue("MSGF:lnNTermIonCurrentRatio", ln_NTerm_ion_current_ratio); + hit->setMetaValue("MSGF:lnCTermIonCurrentRatio", ln_CTerm_ion_current_ratio); + + double ln_MS2_ion_current = log(hit->getMetaValue("MS2IonCurrent").toString().toDouble()); + hit->setMetaValue("MSGF:lnMS2IonCurrent", ln_MS2_ion_current); + + double mean_error_top7 = hit->getMetaValue("MeanErrorTop7").toString().toDouble(); + int num_matched_main_ions = hit->getMetaValue("NumMatchedMainIons").toString().toInt(); + + double stdev_error_top7 = 0.0; if (hit->getMetaValue("StdevErrorTop7").toString() != "NaN") { - stdevErrorTop7 = hit->getMetaValue("StdevErrorTop7").toString().toDouble(); - if (stdevErrorTop7 == 0.0) - { - stdevErrorTop7 = meanErrorTop7; - } - } - else - { - LOG_WARN << "Stdeverrortop7 is NaN" << endl; - } - - meanErrorTop7 = rescaleFragmentFeature(meanErrorTop7, NumMatchedMainIons); - double sqMeanErrorTop7 = rescaleFragmentFeature(meanErrorTop7 * meanErrorTop7, NumMatchedMainIons); - stdevErrorTop7 = rescaleFragmentFeature(stdevErrorTop7, NumMatchedMainIons); - - // write 1 for the correct charge, 0 for other charges - // i.e.: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = min_charge; - while (i <= max_charge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - char aaBefore = hit->getPeptideEvidences().front().getAABefore(); - char aaAfter = hit->getPeptideEvidences().front().getAAAfter(); - - // sequence without modification: "ABC" instead of "ABC[UNIMOD:4]" - String peptide_without_modifications = aaBefore + string(".") + hit->getSequence().toUnmodifiedString() + string(".") + aaAfter; - - // formula taken from percolator msgfplus-converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide_without_modifications.at(0), peptide_without_modifications.at(2), enz); - bool enzC = 
isEnz(peptide_without_modifications.at(peptide_without_modifications.size() - 3), peptide_without_modifications.at(peptide_without_modifications.size() - 1), enz); - int enzInt = countEnzymatic(hit->getSequence().toUnmodifiedString(), enz); - - String peptide_with_modifications = aaBefore + string(".") + hit->getSequence().toString() + string(".") + aaAfter; - String protein = hit->getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = scan_identifier + out_sep + String(label) + out_sep + String(scan_number) + out_sep + (String)rawScore + out_sep + - (String)denovoScore + out_sep + (String)scoreRatio + out_sep + (String)energy + out_sep + (String)ln_eval + - out_sep + (String)isotopeError + out_sep + (String)lnExplainedIonCurrentRatio + out_sep + - (String)lnNTermIonCurrentRatio + out_sep + (String)lnCTermIonCurrentRatio + out_sep + (String)lnMS2IonCurrent - + out_sep + (String)expMass + out_sep + (String)pepLen + out_sep + (String)dM + out_sep + (String)absdM + out_sep + - (String)meanErrorTop7 + out_sep + (String)sqMeanErrorTop7 + out_sep + (String)stdevErrorTop7 + - out_sep + String(ss.str()); - if (addMHC) - { - bool suf = false; - static const string arr[] = {"A", "F", "I", "K", "M", "L", "R", "W", "V"}; - vector mhcends (arr, arr + sizeof(arr) / sizeof(arr[0]) ); - for (std::vector::iterator eit = mhcends.begin(); eit != mhcends.end(); ++eit) + stdev_error_top7 = hit->getMetaValue("StdevErrorTop7").toString().toDouble(); + if (stdev_error_top7 == 0.0) { - if (hit->getSequence().toUnmodifiedString().hasSuffix(string(*eit))) - { - suf = true; - break; - } + stdev_error_top7 = mean_error_top7; } - lis = lis + String(enzN) + out_sep + String(enzC) + out_sep - + String(suf) + out_sep + peptide_with_modifications + out_sep + protein + out_sep; } else { - lis = lis + String(enzN) + out_sep + String(enzC) + out_sep - + String(enzInt) + out_sep + peptide_with_modifications + out_sep + protein + out_sep; 
+ stdev_error_top7 = mean_error_top7; + LOG_WARN << "StdevErrorTop7 is NaN, setting as MeanErrorTop7 instead." << endl; } - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); + + mean_error_top7 = rescaleFragmentFeature_(mean_error_top7, num_matched_main_ions); + double sq_mean_error_top7 = rescaleFragmentFeature_(mean_error_top7 * mean_error_top7, num_matched_main_ions); + stdev_error_top7 = rescaleFragmentFeature_(stdev_error_top7, num_matched_main_ions); + hit->setMetaValue("MSGF:MeanErrorTop7", mean_error_top7); + hit->setMetaValue("MSGF:sqMeanErrorTop7", sq_mean_error_top7); + hit->setMetaValue("MSGF:StdevErrorTop7", stdev_error_top7); } } } } } - - void TopPerc::prepareXTANDEMpin(vector& peptide_ids, string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep) + + void TopPerc::addXTANDEMFeatures(vector& peptide_ids, StringList& feature_set) { - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << min_charge << ", "; - for (int j = min_charge + 1; j < max_charge + 1; j++) - { - - ss << "Charge" << j << ","; - } - // Find out which ions are in XTandem-File and take only these as features - stringstream ss_ion; - if (peptide_ids.front().getHits().front().getMetaValue("a_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion << "frac_ion_a" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("b_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion << "frac_ion_b" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("c_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion << "frac_ion_c" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("x_score").toString() != "" && - 
peptide_ids.front().getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion << "frac_ion_x" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("y_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion << "frac_ion_y" << ","; - } - if (peptide_ids.front().getHits().front().getMetaValue("z_score").toString() != "" && - peptide_ids.front().getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion << "frac_ion_z" << ","; - } - - // Create header for the features - String featureset = "SpecId,Label,ScanNr,hyperscore,deltascore," + ss_ion.str() + - ",Mass,dM,absdM,PepLen," + ss.str() + "enzN,enzC,enzInt,Peptide,Proteins"; - StringList txt_header = ListUtils::create(featureset); - // Insert the header with the features names to the file - txt.addLine(ListUtils::concatenate(txt_header, out_sep)); - - LOG_INFO << "read in target file" << endl; - // get all the features from the target file - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + StringList ion_types = ListUtils::create("a,b,c,x,y,z"); + StringList ion_types_found; + for (StringList::const_iterator ion = ion_types.begin(); ion != ion_types.end(); ++ion) { - if (it->isHigherScoreBetter()) + if (peptide_ids.front().getHits().front().getMetaValue(*ion + "_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue(*ion + "_ions").toString() != "") { - String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); - Int scan_number = getScanNumber(scan_identifier); - int charge = it->getHits().front().getCharge(); - int label = 1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out 
correct ion types and get its Values - stringstream ss_ion_2; - - if (it->getHits().front().getMetaValue("a_score").toString() != "" && - it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && - it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" && - it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && - it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && - it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && - it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length << out_sep; - } - double mass = it->getHits().front().getMetaValue("mass"); - double dm = it->getHits().front().getMetaValue("delta"); - double mh = mass + dm; - double absdM = abs(dm); - - // write 1 for the correct charge, 0 for other charges - // i.e.: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = min_charge; - while (i <= max_charge) - { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; - } - - char aaBefore = 
it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); - int enzInt = countEnzymatic(sequence, enz); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = scan_identifier + "_" + String(charge) + - "_1" + out_sep + String(label) + out_sep + String(scan_number) + out_sep + String(hyperscore) + - out_sep + String(deltascore) + out_sep + ss_ion_2.str() + String(mh) + out_sep + - String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + String(ss.str()) + - String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); + feature_set.push_back("XTANDEM:frac_ion_" + *ion); + ion_types_found.push_back(*ion); } } - - LOG_INFO << "read in decoy file" << endl; - // get all the features from the decoy file + feature_set.push_back("XTANDEM:hyperscore"); + feature_set.push_back("XTANDEM:deltascore"); + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { - if (it->isHigherScoreBetter()) - { - String scan_identifier = String(it->getMetaValue("spectrum_reference")); - Int scan_number = getScanNumber(scan_identifier); - int charge = it->getHits().front().getCharge(); - int label = -1; - double hyperscore = it->getHits().front().getScore(); - // deltascore = hyperscore - nextscore - double deltascore = hyperscore - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); - String sequence = 
it->getHits().front().getSequence().toString(); - int length = sequence.length(); - - // Find out correct ion types and get its Values - stringstream ss_ion_2; + double hyper_score = it->getHits().front().getScore(); + double delta_score = hyper_score - it->getHits().front().getMetaValue("nextscore").toString().toDouble(); + it->getHits().front().setMetaValue("XTANDEM:hyperscore", hyper_score); + it->getHits().front().setMetaValue("XTANDEM:deltascore", delta_score); + + String sequence = it->getHits().front().getSequence().toUnmodifiedString(); + int length = sequence.length(); - if (it->getHits().front().getMetaValue("a_score").toString() != "" && it->getHits().front().getMetaValue("a_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("a_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("b_score").toString() != "" && it->getHits().front().getMetaValue("b_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("b_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("c_score").toString() != "" && it->getHits().front().getMetaValue("c_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("c_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("x_score").toString() != "" && it->getHits().front().getMetaValue("x_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("x_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("y_score").toString() != "" && it->getHits().front().getMetaValue("y_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("y_ions")) / length << out_sep; - } - if (it->getHits().front().getMetaValue("z_score").toString() != "" && it->getHits().front().getMetaValue("z_ions").toString() != "") - { - ss_ion_2 << double(it->getHits().front().getMetaValue("z_ions")) / length; - } - double mass = 
it->getHits().front().getMetaValue("mass"); - double dm = double(it->getHits().front().getMetaValue("delta")); - double mh = mass + dm; - double absdM = abs(dm); - - // write 1 for the correct charge, 0 for other charges - // i.e: charge 3 for charges from 2-5: 0 1 0 0 - stringstream ss; - int i = min_charge; - while (i <= max_charge) + // Find out correct ion types and get its Values + for (StringList::const_iterator ion = ion_types_found.begin(); ion != ion_types_found.end(); ++ion) + { + if (peptide_ids.front().getHits().front().getMetaValue(*ion + "_score").toString() != "" && + peptide_ids.front().getHits().front().getMetaValue(*ion + "_ions").toString() != "") { - if (charge != i) - { - ss << "0" << out_sep; - } - if (charge == i) - { - ss << "1" << out_sep; - } - i++; + // recalculate ion score + double ion_score = it->getHits().front().getMetaValue(*ion + "_ions").toString().toDouble() / length; + it->getHits().front().setMetaValue("XTANDEM:frac_ion_" + *ion, ion_score); } - - char aaBefore = it->getHits().front().getPeptideEvidences().front().getAABefore(); - char aaAfter = it->getHits().front().getPeptideEvidences().front().getAAAfter(); - - String peptide = aaBefore + string(".") + sequence + string(".") + aaAfter; - - // formula taken from percolator converter isEnz(n, c) for trypsin - bool enzN = isEnz(peptide.at(0), peptide.at(2), enz); - bool enzC = isEnz(peptide.at(peptide.size() - 3), peptide.at(peptide.size() - 1), enz); - int enzInt = countEnzymatic(sequence, enz); - String protein = it->getHits().front().getPeptideEvidences().front().getProteinAccession(); - - // One PeptideSpectrumHit with all its features - String lis = scan_identifier + "_" + String(charge) + "_1" + out_sep + String(label) + out_sep + String(scan_number) + out_sep + String(hyperscore) + out_sep + String(deltascore) + out_sep + ss_ion_2.str() + out_sep - + String(mh) + out_sep + String(dm) + out_sep + String(absdM) + out_sep + String(length) + out_sep + ss.str() + out_sep + 
String(enzN) + out_sep + String(enzC) + out_sep + String(enzInt) + out_sep + peptide + out_sep + protein; - - // peptide Spectrum Hit pushed to the output file - txt.addLine(lis); } } } - void TopPerc::prepareCOMETpin(vector& peptide_ids, string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep) + void TopPerc::addCOMETFeatures(vector& peptide_ids, StringList& feature_set) { - /** -with decoy comet search -id label ScanNr lnrSp deltLCn deltCn lnExpect Xcorr Sp IonFrac Mass PepLen Charge1 Charge2 Charge3 Charge4 Charge5 Charge6 enzN enzC enzInt lnNumSP dM absdM peptide proteinId1 -/home/.../150209_msms4_45_3_1 1 45 2.890372 0.066992 0.055908 2.212066 0.917335 94.189621 0.1875 1541.939549 13 0 0 1 0 0 0 0 0 2 9.352534 -0.000001 0.000001 H.FVIIIRKQTDLPV.I XXX_sp|P30307|MPIP3_HUMAN -/home/.../150209_msms4_55_2_1 -1 55 2.70805 0.257442 0.142884 0.236914 0.884307 65.903358 0.3125 1087.697313 9 0 1 0 0 0 0 0 0 3 8.382976 0.000002 0.000002 F.TIRRKSLLT.S DECOY_XXX_sp|Q5VYS8|TUT7_HUMAN -/home/.../150209_msms4_58_2_1 -1 58 2.079442 0.189541 0.038707 1.094758 0.910897 67.636864 0.2778 1086.695849 10 0 1 0 0 0 0 1 0 4 8.947416 -0.000003 0.000003 K.SKAKKPTKKA.K DECOY_sp|P16403|H12_HUMAN -/home/.../150209_msms4_70_3_1 1 70 0.693147 0.199883 0.161329 0.813617 1.455887 249.314102 0.2708 1399.760349 13 0 0 1 0 0 0 0 1 1 10.987528 0.000009 0.000009 V.KFNGAHIPGSPFK.I sp|Q14315|FLNC_HUMAN - */ - - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << min_charge << ", "; - for (int j = min_charge+1; j <= max_charge; j++) - { - ss << "Charge" << j << ","; - } - - String featureset = "id,label,ScanNr,lnrSp,deltLCn,deltCn,lnExpect,Xcorr,Sp,IonFrac,Mass,PepLen," - + ss.str() - + "enzN,enzC,enzInt,lnNumSP,dM,absdM,peptide,proteinId1"; - StringList txt_header = ListUtils::create(featureset); - // Insert the header with the features names to the file - txt.addLine(ListUtils::concatenate(txt_header, out_sep)); - - // get all the 
feature values + feature_set.push_back("COMET:deltCn"); // recalculated deltCn = (current_XCorr - 2nd_best_XCorr) / max(current_XCorr, 1) + feature_set.push_back("COMET:deltLCn"); // deltLCn = (current_XCorr - worst_XCorr) / max(current_XCorr, 1) + feature_set.push_back("COMET:lnExpect"); // log(E-value) + feature_set.push_back("MS:1002252"); // unchanged XCorr + feature_set.push_back("MS:1002255"); // unchanged Sp = number of candidate peptides + feature_set.push_back("COMET:lnNumSP"); // log(number of candidate peptides) + feature_set.push_back("COMET:lnRankSP"); // log(rank based on Sp score) + feature_set.push_back("COMET:IonFrac"); // matched_ions / total_ions + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { - double deltaLCn = 0; - for (vector::iterator jt = it->getHits().begin(); jt != it->getHits().end(); ++jt) + double worst_xcorr = 0, second_xcorr = 0; + Int cnt = 0; + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - deltaLCn += double(jt->getMetaValue("MS:1002253")); + double xcorr = hit->getMetaValue("MS:1002252").toString().toDouble(); + worst_xcorr = xcorr; + if (cnt == 1) second_xcorr = xcorr; + ++cnt; } - it->sort(); - it->assignRanks(); - String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); - Int scan_number = getScanNumber(scan_identifier); - std::vector hits = it->getHits(); - for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) + + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - int charge = jt->getCharge(); - int label = 1; - if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) - { - label = -1; - } - //Xcorr - String xcorr = String(jt->getMetaValue("MS:1002252")); - //deltCn - String deltaCn = String(jt->getMetaValue("MS:1002253")); - //deltLCn deltaCn between first and last, i.e. 
sum in peptidehit - //lnExpect - String lnExpect = String(log(double(jt->getMetaValue("MS:1002257")))); - //Sp - String sp = String(jt->getMetaValue("MS:1002255")); - //lnrSp log n rank Sp - String lnrSp = String(log(double(jt->getMetaValue("MS:1002256")))); - //IonFrac - String ionfrac = double(jt->getMetaValue("MS:1002258"))/double(jt->getMetaValue("MS:1002259")); - //Mass - double mass = jt->getSequence().getMonoWeight(Residue::Full, charge)/charge; - //PepLen - int peplen = jt->getSequence().size(); //NB comet assigns peplen 0 to decoys? - //Chargen - StringList chargen; - // write 1 for the correct charge, 0 for other charges - for (int i = min_charge; i <= max_charge; ++i) - { - if (charge != i) - { - chargen.push_back("0"); - } - else - { - chargen.push_back("1"); - } - } - //enzN - bool enzN = isEnz(jt->getPeptideEvidences().front().getAABefore(), jt->getSequence().getPrefix(1).toString().c_str()[0], enz); - //enzC - bool enzC = isEnz(jt->getSequence().getSuffix(1).toString().c_str()[0], jt->getPeptideEvidences().front().getAAAfter(), enz); - //enzInt - int enzInt = countEnzymatic(jt->getSequence().toUnmodifiedString(), enz); - //lnNumSP - //this is practically not obtainable, as this seems to be the logn of the number of - //internally matched decoy or target hits to that spectrum query depending on the current hit itself - //is approximated by number of matched peptides - String lnNumSP = String(log(double(jt->getMetaValue("num_matched_peptides")))); - //dM - double dm = it->getMZ() - mass; - //absdM - double absdm = abs(dm); - //peptide - String sequence = ""; - sequence += String(jt->getPeptideEvidences().front().getAABefore()); // just first peptide evidence - sequence += jt->getSequence().toString(); - sequence += String(jt->getPeptideEvidences().front().getAAAfter()); //just first peptide evidence - //proteinId1 - String pepevid = ""; - for (vector::const_iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) - 
{ - pepevid += kt->getProteinAccession(); - } - - StringList row; - row.push_back(scan_identifier); - row.push_back(label); - row.push_back(String(scan_number)); - row.push_back(lnrSp); - row.push_back(deltaLCn); - row.push_back(deltaCn); - row.push_back(lnExpect); - row.push_back(xcorr); - row.push_back(sp); - row.push_back(ionfrac); - row.push_back(String(mass)); - row.push_back(String(peplen)); - row.push_back(ListUtils::concatenate(chargen, out_sep)); - row.push_back(String(enzN)); - row.push_back(String(enzC)); - row.push_back(String(enzInt)); - row.push_back(lnNumSP); - row.push_back(String(dm)); - row.push_back(String(absdm)); - row.push_back(sequence); - row.push_back(pepevid); - - txt.addLine(ListUtils::concatenate(row, out_sep)); + double xcorr = hit->getMetaValue("MS:1002252").toString().toDouble(); + double delta_cn = (xcorr - second_xcorr) / max(1.0, xcorr); + double delta_last_cn = (xcorr - worst_xcorr) / max(1.0, xcorr); + hit->setMetaValue("COMET:deltCn", delta_cn); + hit->setMetaValue("COMET:deltLCn", delta_last_cn); + + double ln_expect = log(hit->getMetaValue("MS:1002257").toString().toDouble()); + hit->setMetaValue("COMET:lnExpect", ln_expect); + + double ln_num_sp = log(hit->getMetaValue("MS:1002255").toString().toDouble()); + double ln_rank_sp = log(max(1.0, hit->getMetaValue("MS:1002256").toString().toDouble())); + hit->setMetaValue("COMET:lnNumSP", ln_num_sp); + hit->setMetaValue("COMET:lnRankSP", ln_rank_sp); + + double num_matched_ions = hit->getMetaValue("MS:1002258").toString().toDouble(); + double num_total_ions = hit->getMetaValue("MS:1002259").toString().toDouble(); + double ion_frac = num_matched_ions / num_total_ions; + hit->setMetaValue("COMET:IonFrac", ion_frac); } } } - void TopPerc::prepareMASCOTpin(vector& peptide_ids, string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep) - { - /** -Features 1-9 Represent the Basic Feature Set and Features 10-18 Represent the Extended Feature Set As Used in Mascot Percolator 
- -feature abbreviation feature description -1. mass Calculated monoisotopic mass of the identified peptide. -2. charge Precursor ion charge -3. mScore Mascot score -4. dScore Mascot score minus Mascot score of next best nonisobaric peptide hit -5. deltaM Calculated minus observed peptide mass (in Dalton and ppm). -6. absDeltaM Absolute value of calculated minus observed peptide mass (in Dalton and ppm) -7. isoDeltaM Calculated minus observed peptide mass, isotope error corrected (in Dalton and ppm) -8. uniquePeps None (0), one (1), two or more (2) distinct peptide sequences match same protein -9. mc Missed tryptic cleavages -10. totInt Total ion intensity (log) -11. intMatchedTot Total matched ion intensity (log) -12. relIntMatchedTot Total matched ion intensity divided by total ion intensity -13. binom Peptide Score as described in ref 28 -14. fragMassError Mean fragment mass error (in Dalton and ppm) -15. absFragMassError Mean absolute fragment mass error (in Dalton and ppm) -16. fracIonsMatched Fraction of calculated ions matched (per ion series) -17. seqCov Sequence coverage of matched ions (per ion series) -18. intMatched Matched ion intensity (per ion series) - */ - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << min_charge << ", "; - for (int j = min_charge+1; j <= max_charge; j++) - { - ss << "Charge" << j << ","; - } - - String featureset = "id,label,ScanNr,mass," - + ss.str() - + "mScore,dScore,deltaMass, absDeltaMass, uniqueToProt, enzN, enzC, enzInt, mod,sequence,protein"; - StringList txt_header = ListUtils::create(featureset); - // Insert the header with the features names to the file - txt.addLine(ListUtils::concatenate(txt_header, out_sep)); - - // get all the feature values + /** + Features 1-9 Represent the Basic Feature Set + + feature abbreviation feature description + 1. mass Calculated monoisotopic mass of the identified peptide. Present as generic feature. + 2. 
charge Precursor ion charge. Present as generic feature. + 3. mScore Mascot score. Added in this function. + 4. dScore Mascot score minus Mascot score of next best nonisobaric peptide hit. Added in this function. + 5. deltaM Calculated minus observed peptide mass (in Dalton and ppm). Present as generic feature. + 6. absDeltaM Absolute value of calculated minus observed peptide mass (in Dalton and ppm). Present as generic feature. + 7. isoDeltaM Calculated minus observed peptide mass, isotope error corrected (in Dalton and ppm) + 8. uniquePeps None (0), one (1), two or more (2) distinct peptide sequences match same protein. Added in this function. + 9. mc Missed tryptic cleavages. Present as generic feature. + + Features 10-18 Represent the Extended Feature Set As Used in Mascot Percolator + + feature abbreviation feature description + 10. totInt Total ion intensity (log). Not available in mascot adapter. + 11. intMatchedTot Total matched ion intensity (log). Not available in mascot adapter. + 12. relIntMatchedTot Total matched ion intensity divided by total ion intensity. Not available in mascot adapter. + 13. binom Peptide Score as described in ref 28. Not available in mascot adapter. + 14. fragMassError Mean fragment mass error (in Dalton and ppm). Not available in mascot adapter. + 15. absFragMassError Mean absolute fragment mass error (in Dalton and ppm). Not available in mascot adapter. + 16. fracIonsMatched Fraction of calculated ions matched (per ion series). Not available in mascot adapter. + 17. seqCov Sequence coverage of matched ions (per ion series). Not available in mascot adapter. + 18. intMatched Matched ion intensity (per ion series). Not available in mascot adapter. 
+ */ + void TopPerc::addMASCOTFeatures(vector& peptide_ids, StringList& feature_set) + { + feature_set.push_back("MS:1001171"); // unchanged mScore + feature_set.push_back("MASCOT:delta_score"); // delta score based on mScore + feature_set.push_back("MASCOT:uniqueToProt"); // bool: peptide unique to protein + feature_set.push_back("MASCOT:hasMod"); // bool: has post translational modification + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { - double deltaLCn = 0; - for (vector::iterator jt = it->getHits().begin(); jt != it->getHits().end(); ++jt) - { - deltaLCn += double(jt->getMetaValue("MS:1002253")); - } it->sort(); it->assignRanks(); - String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); - Int scan_number = getScanNumber(scan_identifier); - it->sort(); - it->assignRanks(); - std::vector hits = it->getHits(); - assignDeltaScore(hits, "MS:1001171"); - for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) + assignDeltaScore_(hits, "MS:1001171", "MASCOT:delta_score"); + + for (vector::iterator hit = hits.begin(); hit != hits.end(); ++hit) { - int label = 1; - if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) - { - label = -1; - } - int charge = jt->getCharge(); - double mass = jt->getSequence().getMonoWeight(Residue::Full, charge)/charge; - //Chargen - StringList chargen; - // write 1 for the correct charge, 0 for other charges - for (int i = min_charge; i <= max_charge; ++i) - { - if (charge != i) - { - chargen.push_back("0"); - } - else - { - chargen.push_back("1"); - } - } - double mScore = double(jt->getMetaValue("MS:1001171")); - double dScore = double(jt->getMetaValue("delta_score")); - double dm = it->getMZ() - mass; - double absdm = abs(dm); - //no isoDeltaM - no isotope error info available from mascot adapter - //no uniquePeps - no info from mascot substitute with sequence to protein uniqueness - String uniquePeps = "0"; - if 
(String(jt->getMetaValue("protein_references")) == "unique") - { - uniquePeps = "1"; - } - bool enzN = isEnz(jt->getPeptideEvidences().front().getAABefore(), jt->getSequence().getPrefix(1).toString().c_str()[0], enz); - //enzC - bool enzC = isEnz(jt->getSequence().getSuffix(1).toString().c_str()[0], jt->getPeptideEvidences().front().getAAAfter(), enz); - //enzInt - int enzInt = countEnzymatic(jt->getSequence().toUnmodifiedString(), enz); - //no totInt info available from mascot adapter - //no intMatchedTot info available from mascot adapter - //no relIntMatchedTot info available from mascot adapter - //no binom info available from mascot adapter - //no fragMassError info available from mascot adapter - //no absFragMassError info available from mascot adapter - //no fracIonsMatched info available from mascot adapter - //no seqCov info available from mascot adapter - //no intMatched info available from mascot adapter - - bool mod = jt->getSequence().isModified(); - String sequence = ""; - sequence += String(jt->getPeptideEvidences().front().getAABefore()); // just first peptide evidence - sequence += jt->getSequence().toString(); - sequence += String(jt->getPeptideEvidences().front().getAAAfter()); //just first peptide evidence - //proteinId1 - String pepevid = ""; - for (vector::const_iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) - { - pepevid += kt->getProteinAccession(); - } - - StringList row; - row.push_back(scan_identifier); - row.push_back(label); - row.push_back(String(scan_number)); - row.push_back(String(mass)); - row.push_back(ListUtils::concatenate(chargen, out_sep)); - row.push_back(String(mScore)); - row.push_back(String(dScore)); - row.push_back(String(dm)); - row.push_back(String(absdm)); - row.push_back(String(uniquePeps)); - row.push_back(String(enzN)); - row.push_back(String(enzC)); - row.push_back(String(enzInt)); - row.push_back(String(mod)); - row.push_back(sequence); - row.push_back(pepevid); - - 
txt.addLine(ListUtils::concatenate(row, out_sep)); + bool unique_to_protein = (String(hit->getMetaValue("protein_references")) == "unique"); + bool has_mod = hit->getSequence().isModified(); + hit->setMetaValue("COMET:uniqueToProt", unique_to_protein); + hit->setMetaValue("COMET:hasMod", has_mod); } } } - // Function adapted from Enzyme.h in Percolator converter - bool TopPerc::isEnz(const char& n, const char& c, string& enz) - { - if (enz == "trypsin") - { - return ((n == 'K' || n == 'R') && c != 'P') || n == '-' || c == '-'; - } - else if (enz == "chymotrypsin") - { - return ((n == 'F' || n == 'W' || n == 'Y' || n == 'L') && c != 'P') || n == '-' || c == '-'; - } - else if (enz == "thermolysin") - { - return ((c == 'A' || c == 'F' || c == 'I' || c == 'L' || c == 'M' - || c == 'V' || (n == 'R' && c == 'G')) && n != 'D' && n != 'E') || n == '-' || c == '-'; - } - else if (enz == "proteinasek") - { - return (n == 'A' || n == 'E' || n == 'F' || n == 'I' || n == 'L' - || n == 'T' || n == 'V' || n == 'W' || n == 'Y') || n == '-' || c == '-'; - } - else if (enz == "pepsin") - { - return ((c == 'F' || c == 'L' || c == 'W' || c == 'Y' || n == 'F' - || n == 'L' || n == 'W' || n == 'Y') && n != 'R') || n == '-' || c == '-'; - } - else if (enz == "elastase") - { - return ((n == 'L' || n == 'V' || n == 'A' || n == 'G') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "lys-n") - { - return (c == 'K') - || n == '-' || c == '-'; - } - else if (enz == "lys-c") - { - return ((n == 'K') && c != 'P') - || n == '-' || c == '-'; + void TopPerc::addCONCATSEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set) + { + for (StringList::iterator it = search_engines_used.begin(); it != search_engines_used.end(); ++it) { + feature_set.push_back("CONCAT:" + *it); } - else if (enz == "arg-c") - { - return ((n == 'R') && c != 'P') - || n == '-' || c == '-'; - } - else if (enz == "asp-n") - { - return (c == 'D') - || n == '-' || c == '-'; - } - else 
if (enz == "glu-c") - { - return ((n == 'E') && (c != 'P')) - || n == '-' || c == '-'; - } - else - { - return true; - } - } - - // Function adapted from Enzyme.h in Percolator converter - Size TopPerc::countEnzymatic(String peptide, string& enz) - { - Size count = 0; - for (Size ix = 1; ix < peptide.size(); ++ix) + LOG_INFO << "Using " << ListUtils::concatenate(search_engines_used, ", ") << " as source for search engine specific features." << endl; + feature_set.push_back("CONCAT:lnEvalue"); + feature_set.push_back("CONCAT:deltaLnEvalue"); + + // feature values have been set in concatMULTISEids + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { - if (isEnz(peptide[ix - 1], peptide[ix], enz)) - { - ++count; - } + it->sort(); + it->assignRanks(); + assignDeltaScore_(it->getHits(), "CONCAT:lnEvalue", "CONCAT:deltaLnEvalue"); } - return count; } - // Function adapted from MsgfplusReader in Percolator converter - double TopPerc::rescaleFragmentFeature(double featureValue, int NumMatchedMainIons) + void TopPerc::mergeMULTISEids(vector& all_protein_ids, vector& all_peptide_ids, vector& new_protein_ids, vector& new_peptide_ids, StringList& search_engines_used) { - // Rescale the fragment features to penalize features calculated by few ions - int numMatchedIonLimit = 7; - int numerator = (1 + numMatchedIonLimit) * (1 + numMatchedIonLimit); - int denominator = (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)) * (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)); - return featureValue * ((double)numerator / denominator); - } - - String TopPerc::getScanIdentifier(vector::iterator it, vector::iterator start) - { - String scan_identifier = it->getMetaValue("spectrum_reference"); - if (scan_identifier.empty()) + LOG_DEBUG << "creating spectrum map" << endl; + + std::map unified; + //setup map of merge characteristics per spectrum + for (vector::iterator pit = all_peptide_ids.begin(); pit != all_peptide_ids.end(); ++pit) { - scan_identifier = 
String(it->getMetaValue("spectrum_id")); - if (scan_identifier.empty()) - { - scan_identifier = String(it - start + 1); - LOG_WARN << "no known spectrum identifiers, using index [1,n] - use at own risk." << endl; - } + PeptideIdentification ins = *pit; + ins.setScoreType("multiple"); + ins.setIdentifier("TopPerc_multiple_SE_input"); + String spectrum_reference = ins.getMetaValue("spectrum_reference"); + unified[spectrum_reference] = ins; } - return scan_identifier.removeWhitespaces(); - } - - Int TopPerc::getScanNumber(String scan_identifier) - { - Size idx = 0; - if ((idx = scan_identifier.find("index=")) != std::string::npos) + + String search_engine = new_protein_ids.front().getSearchEngine(); + if (!ListUtils::contains(search_engines_used, search_engine)) { - scan_identifier = scan_identifier.substr(idx + 6); + search_engines_used.push_back(search_engine); } - else if ((idx = scan_identifier.find("scan=")) != std::string::npos) + + for (vector::iterator pit = new_peptide_ids.begin(); pit != new_peptide_ids.end(); ++pit) { - scan_identifier = scan_identifier.substr(idx + 5); - } - return scan_identifier.toInt(); - } - - void TopPerc::assignDeltaScore(vector& hits, String score_ref) - { - if (!hits.empty()) - { - vector::iterator prev = hits.begin(); - double prev_score = double(prev->getMetaValue(score_ref)); - for (vector::iterator jt = hits.begin()+1; jt != hits.end(); ++jt) + PeptideIdentification ins = *pit; + //prepare for merge + for (vector::iterator hit = ins.getHits().begin(); hit != ins.getHits().end(); ++hit) { - double cur_score = double(jt->getMetaValue(score_ref)); - double value = prev_score - cur_score; - prev->setMetaValue("delta_score",value); - prev = jt; + hit->setScore(1); } - (hits.end()-1)->setMetaValue("delta_score",0.0); //if last hit or only one hit - } - } - - void TopPerc::mergeMULTIids(vector >& protein_ids_list, vector >& peptide_ids_list, bool skip_checks) - { - //both input parameters must correspond - if (peptide_ids_list.size() 
!= protein_ids_list.size()) - { - throw Exception::ElementNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Protein and Peptide Identification mismatch"); - } - - //search parameters of all runs must correspond (considering front() of each only) - if (!skip_checks) - { - for (Size i=1; i < protein_ids_list.size(); ++i) + ins.setScoreType("multiple"); + ins.setIdentifier("TopPerc_multiple_SE_input"); + String spectrum_reference = ins.getMetaValue("spectrum_reference"); + //merge in unified map + if (unified.find(spectrum_reference) == unified.end()) { - if( protein_ids_list[i-1].front().getSearchParameters().db != protein_ids_list[i].front().getSearchParameters().db ) - { - throw Exception::ElementNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, protein_ids_list[i-1].front().getSearchParameters().db+"!="+protein_ids_list[i].front().getSearchParameters().db); - } + unified[spectrum_reference] = ins; } - } - - LOG_DEBUG << "creating spectrum map" << endl; - - //setup map of merge characteristics per spectrum - std::map unified; - - std::string common = "q-value_score, expect_score"; - StringList commonMetaValues = ListUtils::create(common); - for (std::vector >::iterator pilit = peptide_ids_list.begin(); pilit != peptide_ids_list.end(); ++pilit) - { - String SE = protein_ids_list[distance(peptide_ids_list.begin(), pilit)].front().getSearchEngine(); - for (vector::iterator pit = pilit->begin(); pit != pilit->end(); ++pit) + else { - PeptideIdentification ins = *pit; - //prepare for merge + //find corresponding hit for (vector::iterator hit = ins.getHits().begin(); hit != ins.getHits().end(); ++hit) { - //move score from each hit to meta value - hit->setMetaValue(SE + ":" + ins.getScoreType(), hit->getScore()); - //set score in each hit to #SE hits - hit->setScore(1); - //rename common meta values (to SE:commonmetavaluename) - for (Size i = 0; i < commonMetaValues.size(); ++i) + for (vector::iterator merger = unified[spectrum_reference].getHits().begin(); merger != 
unified[spectrum_reference].getHits().end(); ++merger) { - if (hit->metaValueExists(commonMetaValues[i])) + if (hit->getSequence()==merger->getSequence()) { - DataValue val = hit->getMetaValue(commonMetaValues[i]); - hit->setMetaValue(SE+":"+commonMetaValues[i],val); - hit->removeMetaValue(commonMetaValues[i]); - } - } - } - ins.setScoreType("multiple"); - ins.setIdentifier("TopPerc_multiple_SE_input"); - String spectrum_reference = ins.getMetaValue("spectrum_reference"); - //merge in unified map - if (unified.find(spectrum_reference) == unified.end()) - { - unified[spectrum_reference] = ins; - } - else - { - //find corresponding hit - for (vector::iterator hit = ins.getHits().begin(); hit != ins.getHits().end(); ++hit) - { - for (vector::iterator merger = unified[spectrum_reference].getHits().begin(); merger != unified[spectrum_reference].getHits().end(); ++merger) - { - if (hit->getSequence()==merger->getSequence()) - { - //care for peptide evidences!! set would be okay if checked for same search db in parameters, + //care for peptide evidences!! 
set would be okay if checked for same search db in parameters, // vector pev; // pev.reserve(max(hit->getPeptideEvidences().size(),merger->getPeptideEvidences().size())); // std::vector::iterator uni; @@ -1050,27 +377,28 @@ feature abbreviation feature description // TopPerc::lq_PeptideEvidence); // pev.resize(uni-pev.begin()); // merger->setPeptideEvidences(pev); - //There is no mutable getPeptideEvidences() accessor in PeptideHit - above will not werk, but so long: - //Implying PeptideIndexer was applied (with the same search db each) will care for that all PeptideEvidences from two hits with equal AASequence are the same + //There is no mutable getPeptideEvidences() accessor in PeptideHit - above will not werk, but so long: + //Implying PeptideIndexer was applied (with the same search db each) will care for that all PeptideEvidences from two hits with equal AASequence are the same - //merge meta values - vector< String > keys; - hit->getKeys(keys); - for (vector::const_iterator kt = keys.begin(); kt != keys.end(); ++kt) + //merge meta values + StringList keys; + hit->getKeys(keys); + for (StringList::const_iterator kt = keys.begin(); kt != keys.end(); ++kt) + { + if (!merger->metaValueExists(*kt)) { - if (!merger->metaValueExists(*kt)) - { - merger->setMetaValue(*kt, hit->getMetaValue(*kt)); - } + merger->setMetaValue(*kt, hit->getMetaValue(*kt)); } - merger->setScore(merger->getScore() + hit->getScore()); - break; } + // adds up the number of hits, as the score of each separate hit is 1 + merger->setScore(merger->getScore() + hit->getScore()); + break; } } } } } + LOG_DEBUG << "filled spectrum map" << endl; std::vector swip; swip.reserve(unified.size()); @@ -1079,395 +407,207 @@ feature abbreviation feature description { swip.push_back(it->second); } - peptide_ids_list.front().swap(swip); - peptide_ids_list.resize(1); - LOG_DEBUG << "Now containing " << peptide_ids_list.front().size() << " spectra identifications."<< endl; - + all_peptide_ids.swap(swip); + 
LOG_DEBUG << "Now containing " << all_peptide_ids.size() << " spectra identifications."<< endl; + + LOG_DEBUG << "merging search parameters" << endl; //care for search parameters!! + all_protein_ids.front().setIdentifier("TopPerc_multiple_SE_input"); + all_protein_ids.front().setDateTime(DateTime::currentDateTime()); - std::vector swop; - swop.push_back(ProteinIdentification()); - swop.back().setIdentifier("TopPerc_multiple_SE_input"); - swop.back().setDateTime(DateTime::currentDateTime()); - swop.back().setSearchParameters(protein_ids_list.front().front().getSearchParameters()); - for (std::vector >::iterator it = protein_ids_list.begin(); it != protein_ids_list.end(); ++it) + std::vector& all_protein_hits = all_protein_ids.front().getHits(); + std::vector& new_protein_hits = all_protein_ids.front().getHits(); + all_protein_hits.resize(new_protein_hits.size() + all_protein_hits.size()); + + std::sort(new_protein_hits.begin(), new_protein_hits.end(), TopPerc::lq_ProteinHit()); + LOG_DEBUG << "Sorted " << new_protein_hits.size() << " new ProteinHits." << endl; + LOG_DEBUG << "Melting with " << all_protein_hits.size() << " previous ProteinHits." << endl; + if (all_protein_hits.empty()) { - std::vector v; - v.resize(swop.front().getHits().size() + it->front().getHits().size()); - std::vector::iterator uni; - std::sort(it->front().getHits().begin(),it->front().getHits().end(), TopPerc::lq_ProteinHit()); - LOG_DEBUG << "Sorted next part of the ProteinHits." << endl; - LOG_DEBUG << "Melting with that many previous ProteinHits. " << swop.front().getHits().size() << endl; - if (swop.front().getHits().empty()) - { - v.swap(it->front().getHits()); - } - else - { - uni = std::set_union(swop.front().getHits().begin(), swop.front().getHits().end(), - it->front().getHits().begin(),it->front().getHits().end(), v.begin(), - TopPerc::lq_ProteinHit()); - v.resize(uni-v.begin()); - } - LOG_DEBUG << "Melting ProteinHits." 
<< endl; - - swap(swop.front().getHits(),v); - LOG_DEBUG << "Done with next ProteinHits." << endl; - - ProteinIdentification::SearchParameters sp = it->front().getSearchParameters(); - String SE = it->front().getSearchEngine(); - LOG_DEBUG << "Melting Parameters from " << SE << " into MetaInfo." << endl; - {//insert into MetaInfo as SE:param - swop.front().setMetaValue("SE:"+SE,it->front().getSearchEngineVersion()); - swop.front().setMetaValue(SE+":db",sp.db); - swop.front().setMetaValue(SE+":db_version",sp.db_version); - swop.front().setMetaValue(SE+":taxonomy",sp.taxonomy); - swop.front().setMetaValue(SE+":charges",sp.charges); - swop.front().setMetaValue(SE+":fixed_modifications",ListUtils::concatenate(sp.fixed_modifications, ",")); - swop.front().setMetaValue(SE+":variable_modifications",ListUtils::concatenate(sp.variable_modifications, ",")); - swop.front().setMetaValue(SE+":missed_cleavages",sp.missed_cleavages); - swop.front().setMetaValue(SE+":fragment_mass_tolerance",sp.fragment_mass_tolerance); - swop.front().setMetaValue(SE+":fragment_mass_tolerance_ppm",sp.fragment_mass_tolerance_ppm); - swop.front().setMetaValue(SE+":precursor_tolerance",sp.precursor_tolerance); - swop.front().setMetaValue(SE+":precursor_mass_tolerance_ppm",sp.precursor_mass_tolerance_ppm); - swop.front().setMetaValue(SE+":digestion_enzyme",sp.digestion_enzyme.getName()); - } - swop.front().setPrimaryMSRunPath(it->front().getPrimaryMSRunPath()); - swop.front().setSearchEngine("multiple"); + all_protein_hits.swap(new_protein_hits); + } + else + { + std::vector::iterator uni = std::set_union( + all_protein_hits.begin(), all_protein_hits.end(), + new_protein_hits.begin(), new_protein_hits.end(), all_protein_hits.begin(), + TopPerc::lq_ProteinHit() ); + all_protein_hits.resize(uni - all_protein_hits.begin()); + } + LOG_DEBUG << "Melting ProteinHits." << endl; + LOG_DEBUG << "Done with next ProteinHits." 
<< endl; + + ProteinIdentification::SearchParameters sp = new_protein_ids.front().getSearchParameters(); + String SE = new_protein_ids.front().getSearchEngine(); + LOG_DEBUG << "Melting Parameters from " << SE << " into MetaInfo." << endl; + {//insert into MetaInfo as SE:param + all_protein_ids.front().setMetaValue("SE:"+SE,new_protein_ids.front().getSearchEngineVersion()); + all_protein_ids.front().setMetaValue(SE+":db",sp.db); + all_protein_ids.front().setMetaValue(SE+":db_version",sp.db_version); + all_protein_ids.front().setMetaValue(SE+":taxonomy",sp.taxonomy); + all_protein_ids.front().setMetaValue(SE+":charges",sp.charges); + all_protein_ids.front().setMetaValue(SE+":fixed_modifications",ListUtils::concatenate(sp.fixed_modifications, ",")); + all_protein_ids.front().setMetaValue(SE+":variable_modifications",ListUtils::concatenate(sp.variable_modifications, ",")); + all_protein_ids.front().setMetaValue(SE+":missed_cleavages",sp.missed_cleavages); + all_protein_ids.front().setMetaValue(SE+":fragment_mass_tolerance",sp.fragment_mass_tolerance); + all_protein_ids.front().setMetaValue(SE+":fragment_mass_tolerance_ppm",sp.fragment_mass_tolerance_ppm); + all_protein_ids.front().setMetaValue(SE+":precursor_tolerance",sp.precursor_tolerance); + all_protein_ids.front().setMetaValue(SE+":precursor_mass_tolerance_ppm",sp.precursor_mass_tolerance_ppm); + all_protein_ids.front().setMetaValue(SE+":digestion_enzyme",sp.digestion_enzyme.getName()); + all_protein_ids.front().setPrimaryMSRunPath(new_protein_ids.front().getPrimaryMSRunPath()); + all_protein_ids.front().setSearchEngine("multiple"); LOG_DEBUG << "Done with next Parameters." << endl; } - protein_ids_list.front().swap(swop); - protein_ids_list.resize(1); + LOG_DEBUG << "All merging finished." 
<< endl; - } - - void TopPerc::prepareMULTIpin(vector& peptide_ids, ProteinIdentification& protein_id, string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep) + + void TopPerc::concatMULTISEids(vector& all_protein_ids, vector& all_peptide_ids, vector& new_protein_ids, vector& new_peptide_ids, StringList& search_engines_used) { - //------------------------------------------------------------- - // header - //------------------------------------------------------------- - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << min_charge << ", "; - for (int j = min_charge+1; j <= max_charge; j++) + String search_engine = new_protein_ids.front().getSearchEngine(); + if (!ListUtils::contains(search_engines_used, search_engine)) { - ss << "Charge" << j << ","; + search_engines_used.push_back(search_engine); } - - StringList ses_used; - StringList se_specifics; - StringList keys; - protein_id.getKeys(keys); - - if (ListUtils::contains(keys, "SE:MS-GF+")) - { - ses_used.push_back("MS-GF+"); - se_specifics.push_back("MS:1002049"); // rawscore - se_specifics.push_back("MS:1002053"); // evalue - } - if (ListUtils::contains(keys, "SE:Mascot")) - { - ses_used.push_back("Mascot"); - se_specifics.push_back("Mascot_score"); - se_specifics.push_back("EValue"); - } - if (ListUtils::contains(keys, "SE:Comet")) - { - ses_used.push_back("Comet"); - se_specifics.push_back("MS:1002252"); //xcorr - se_specifics.push_back("MS:1002257"); //evalue - } - if (ListUtils::contains(keys, "SE:XTandem")) - { - ses_used.push_back("XTandem"); - se_specifics.push_back("XTandem_score"); - se_specifics.push_back("E-Value"); - } - - LOG_INFO << "Using " << ListUtils::concatenate(ses_used, ", ") << " as source for search engine specific features." 
<< endl; - - String featureset = "id,label,ScanNr," - + ListUtils::concatenate(se_specifics, ",") + "," - + ss.str() - + "ionfrac,mass,enzN,enzC,enzInt,numHits,dM,absdM,PepLen,peptide,proteinId1"; - StringList txt_header = ListUtils::create(featureset); - // Insert the header with the features names to the file - txt.addLine(ListUtils::concatenate(txt_header, out_sep)); - - //------------------------------------------------------------- - // values - //------------------------------------------------------------- - // get all the feature values - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + + for (vector::iterator pit = new_peptide_ids.begin(); pit != new_peptide_ids.end(); ++pit) { - it->sort(); - it->assignRanks(); - String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); - Int scan_number = getScanNumber(scan_identifier); - StringList idents; - std::vector hits = it->getHits(); - for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) + for (vector::iterator hit = pit->getHits().begin(); hit != pit->getHits().end(); ++hit) { - int charge = jt->getCharge(); - int label = 1; - if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) + double evalue = 1000.0; + if (search_engine == "MS-GF+") { - label = -1; + hit->setMetaValue("CONCAT:" + search_engine, hit->getMetaValue("MS:1002049")); // rawscore + evalue = hit->getMetaValue("MS:1002049").toString().toDouble(); // evalue } - - StringList sesp; - for (StringList::iterator s = se_specifics.begin(); s != se_specifics.end(); ++s) + if (search_engine == "Mascot") { - if (jt->metaValueExists(*s)) - sesp.push_back(String(jt->getMetaValue(*s))); - else - sesp.push_back("-1"); + hit->setMetaValue("CONCAT:" + search_engine, hit->getMetaValue("MS:1001171")); // mscore + evalue = hit->getMetaValue("EValue").toString().toDouble(); } - - StringList chargen; - // write 1 for the correct charge, 0 for other charges - for 
(int i = min_charge; i <= max_charge; ++i) + if (search_engine == "Comet") { - if (charge != i) - { - chargen.push_back("0"); - } - else - { - chargen.push_back("1"); - } + hit->setMetaValue("CONCAT:" + search_engine, hit->getMetaValue("MS:1002252")); // xcorr + evalue = hit->getMetaValue("MS:1002257").toString().toDouble(); } - - //IonFrac - String ionfrac = String(double(jt->getMetaValue("matched_intensity"))/double(jt->getMetaValue("sum_intensity"))); // also consider "matched_ion_number"/"peak_number" - //Mass - double mass = jt->getSequence().getMonoWeight(Residue::Full, charge)/charge; - //enzN - bool enzN = isEnz(jt->getPeptideEvidences().front().getAABefore(), jt->getSequence().getPrefix(1).toString().c_str()[0], enz); - //enzC - bool enzC = isEnz(jt->getSequence().getSuffix(1).toString().c_str()[0], jt->getPeptideEvidences().front().getAAAfter(), enz); - //enzInt - int enzInt = countEnzymatic(jt->getSequence().toUnmodifiedString(), enz); - //numHits - int numHits = jt->getScore(); - //dM - double dm = it->getMZ() - mass; - //absdM - double absdm = abs(dm); - //PepLen - int peplen = jt->getSequence().size(); - //peptide - String sequence = ""; - //replace flanking aa if [ or ] with - - char pb = jt->getPeptideEvidences().front().getAABefore(); - sequence += pb=='['?"-.":String(pb)+"."; // just first peptide evidence - sequence += jt->getSequence().toString(); - char pa = jt->getPeptideEvidences().front().getAAAfter(); - sequence += pa==']'?".-":"."+String(pa); // just first peptide evidence - //proteinId1 - StringList pepevid; - for (vector::const_iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) + if (search_engine == "XTandem") { - pepevid.push_back(kt->getProteinAccession()); + hit->setMetaValue("CONCAT:" + search_engine, hit->getMetaValue("XTandem_score")); // xtandem score + evalue = hit->getMetaValue("E-Value").toString().toDouble(); } - - StringList row; - row.push_back(scan_identifier); - 
row.push_back(label); - row.push_back(String(scan_number)); - row.push_back(ListUtils::concatenate(sesp, out_sep)); - row.push_back(ListUtils::concatenate(chargen, out_sep)); - row.push_back(ionfrac); - row.push_back(String(mass)); - row.push_back(String(enzN)); - row.push_back(String(enzC)); - row.push_back(String(enzInt)); - row.push_back(String(numHits)); - row.push_back(String(dm)); - row.push_back(String(absdm)); - row.push_back(String(peplen)); - row.push_back(sequence); - row.push_back(ListUtils::concatenate(pepevid, out_sep)); - - txt.addLine(ListUtils::concatenate(row, out_sep)); + hit->setMetaValue("CONCAT:lnEvalue", log(evalue)); // log(evalue) } } + all_peptide_ids.insert(all_peptide_ids.end(), new_peptide_ids.begin(), new_peptide_ids.end()); + + ProteinIdentification::SearchParameters sp = new_protein_ids.front().getSearchParameters(); + String SE = new_protein_ids.front().getSearchEngine(); + LOG_DEBUG << "Melting Parameters from " << SE << " into MetaInfo." << endl; + {//insert into MetaInfo as SE:param + all_protein_ids.front().setMetaValue("SE:"+SE,new_protein_ids.front().getSearchEngineVersion()); + all_protein_ids.front().setMetaValue(SE+":db",sp.db); + all_protein_ids.front().setMetaValue(SE+":db_version",sp.db_version); + all_protein_ids.front().setMetaValue(SE+":taxonomy",sp.taxonomy); + all_protein_ids.front().setMetaValue(SE+":charges",sp.charges); + all_protein_ids.front().setMetaValue(SE+":fixed_modifications",ListUtils::concatenate(sp.fixed_modifications, ",")); + all_protein_ids.front().setMetaValue(SE+":variable_modifications",ListUtils::concatenate(sp.variable_modifications, ",")); + all_protein_ids.front().setMetaValue(SE+":missed_cleavages",sp.missed_cleavages); + all_protein_ids.front().setMetaValue(SE+":fragment_mass_tolerance",sp.fragment_mass_tolerance); + all_protein_ids.front().setMetaValue(SE+":fragment_mass_tolerance_ppm",sp.fragment_mass_tolerance_ppm); + 
all_protein_ids.front().setMetaValue(SE+":precursor_tolerance",sp.precursor_tolerance); + all_protein_ids.front().setMetaValue(SE+":precursor_mass_tolerance_ppm",sp.precursor_mass_tolerance_ppm); + all_protein_ids.front().setMetaValue(SE+":digestion_enzyme",sp.digestion_enzyme.getName()); + all_protein_ids.front().setPrimaryMSRunPath(new_protein_ids.front().getPrimaryMSRunPath()); + all_protein_ids.front().setSearchEngine("multiple"); + LOG_DEBUG << "Done with next Parameters." << endl; + } } - void TopPerc::prepareCONCATpin(vector >& peptide_id_list, vector >& protein_id_list, string& enz, TextFile& txt, int min_charge, int max_charge, char out_sep) + void TopPerc::addMULTISEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set) { - //------------------------------------------------------------- - // header - //------------------------------------------------------------- - // Create String of the charges for the header of the tab file - stringstream ss; - ss << "Charge" << min_charge << ", "; - for (int j = min_charge+1; j <= max_charge; j++) + if (ListUtils::contains(search_engines_used, "MS-GF+")) { - ss << "Charge" << j << ","; + feature_set.push_back("MS:1002049"); // rawscore + feature_set.push_back("MS:1002053"); // evalue } - - StringList ses_used; - for (vector >::iterator it = protein_id_list.begin(); it != protein_id_list.end(); ++it) + if (ListUtils::contains(search_engines_used, "Mascot")) { - ses_used.push_back(it->front().getSearchEngine()); + feature_set.push_back("MS:1001171"); + feature_set.push_back("EValue"); } + if (ListUtils::contains(search_engines_used, "Comet")) + { + feature_set.push_back("MS:1002252"); //xcorr + feature_set.push_back("MS:1002257"); //evalue + } + if (ListUtils::contains(search_engines_used, "XTandem")) + { + feature_set.push_back("XTandem_score"); + feature_set.push_back("E-Value"); + } + feature_set.push_back("MULTI:ionFrac"); + feature_set.push_back("MULTI:numHits"); + + LOG_INFO << 
"Using " << ListUtils::concatenate(search_engines_used, ", ") << " as source for search engine specific features." << endl; - LOG_INFO << "Using " << ListUtils::concatenate(ses_used, ", ") << " as source for search engine specific features." << endl; - - String featureset = "id,label,ScanNr," - + ListUtils::concatenate(ses_used, ",") + "," - + ss.str() - + "Evalue,ionfrac,mass,enzN,enzC,enzInt,numHits,dM,absdM,PepLen,peptide,proteinId1"; - StringList txt_header = ListUtils::create(featureset); - // Insert the header with the features names to the file - txt.addLine(ListUtils::concatenate(txt_header, out_sep)); - - //------------------------------------------------------------- - // values - //------------------------------------------------------------- // get all the feature values - for (vector >::iterator pit = peptide_id_list.begin(); pit != peptide_id_list.end(); ++pit) + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { - Size i = std::distance(peptide_id_list.begin(),pit); - String se = protein_id_list[i].front().getSearchEngine(); - for (vector::iterator it = pit->begin(); it != pit->end(); ++it) - { - it->sort(); - it->assignRanks(); - String scan_identifier = getScanIdentifier(it, pit->begin()); - Int scan_number = getScanNumber(scan_identifier); - std::vector hits = it->getHits(); - for (vector::iterator jt = hits.begin(); jt != hits.end(); ++jt) - { - int charge = jt->getCharge(); - int label = 1; - if (jt->metaValueExists("target_decoy") && String(jt->getMetaValue("target_decoy")).hasSubstring("decoy")) - { - label = -1; - } - - StringList sesp; - String ev; - for (StringList::iterator s = ses_used.begin(); s != ses_used.end(); ++s) - { - if ((*s) == se) - { - if (se == "MS-GF+") - { - sesp.push_back(jt->getMetaValue("MS:1002049")); // rawscore - ev = jt->getMetaValue("MS:1002053"); // evalue - } - if (se == "Mascot") - { - sesp.push_back(jt->getMetaValue("Mascot_score")); - ev = jt->getMetaValue("EValue"); - } - if (se 
== "Comet") - { - sesp.push_back(jt->getMetaValue("MS:1002252")); //xcorr - ev = jt->getMetaValue("MS:1002257"); //evalue - } - if (se == "XTandem") - { - sesp.push_back(jt->getMetaValue("XTandem_score")); - ev = jt->getMetaValue("E-Value"); - } - } - else - sesp.push_back("-1"); - } - - StringList chargen; - // write 1 for the correct charge, 0 for other charges - for (int i = min_charge; i <= max_charge; ++i) - { - if (charge != i) - { - chargen.push_back("0"); - } - else - { - chargen.push_back("1"); - } - } - - //IonFrac - String ionfrac = String(double(jt->getMetaValue("matched_intensity"))/double(jt->getMetaValue("sum_intensity"))); // also consider "matched_ion_number"/"peak_number" - //Mass - double mass = jt->getSequence().getMonoWeight(Residue::Full, charge)/charge; - //enzN - bool enzN = isEnz(jt->getPeptideEvidences().front().getAABefore(), jt->getSequence().getPrefix(1).toString().c_str()[0], enz); - //enzC - bool enzC = isEnz(jt->getSequence().getSuffix(1).toString().c_str()[0], jt->getPeptideEvidences().front().getAAAfter(), enz); - //enzInt - int enzInt = countEnzymatic(jt->getSequence().toUnmodifiedString(), enz); - //numHits - int numHits = jt->getScore(); - //dM - double dm = it->getMZ() - mass; - //absdM - double absdm = abs(dm); - //PepLen - int peplen = jt->getSequence().size(); - //peptide - String sequence = ""; - //replace flanking aa if [ or ] with - - char pb = jt->getPeptideEvidences().front().getAABefore(); - sequence += pb=='['?"-.":String(pb)+"."; // just first peptide evidence - sequence += jt->getSequence().toString(); - char pa = jt->getPeptideEvidences().front().getAAAfter(); - sequence += pa==']'?".-":"."+String(pa); // just first peptide evidence - //proteinId1 - StringList pepevid; - for (vector::const_iterator kt = jt->getPeptideEvidences().begin(); kt != jt->getPeptideEvidences().end(); ++kt) - { - pepevid.push_back(kt->getProteinAccession()); - } - - StringList row; - row.push_back(scan_identifier); - row.push_back(label); - 
row.push_back(String(scan_number)); - row.push_back(ListUtils::concatenate(sesp, out_sep)); - row.push_back(ListUtils::concatenate(chargen, out_sep)); - row.push_back(ev); - row.push_back(ionfrac); - row.push_back(String(mass)); - row.push_back(String(enzN)); - row.push_back(String(enzC)); - row.push_back(String(enzInt)); - row.push_back(String(numHits)); - row.push_back(String(dm)); - row.push_back(String(absdm)); - row.push_back(String(peplen)); - row.push_back(sequence); - row.push_back(ListUtils::concatenate(pepevid, out_sep)); - - txt.addLine(ListUtils::concatenate(row, out_sep)); - } + it->sort(); + it->assignRanks(); + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + double ion_frac = hit->getMetaValue("matched_intensity").toString().toDouble() / hit->getMetaValue("sum_intensity").toString().toDouble(); // also consider "matched_ion_number"/"peak_number" + hit->setMetaValue("MULTI:ionFrac", ion_frac); + + int num_hits = hit->getScore(); + hit->setMetaValue("MULTI:numHits", num_hits); } } } - void TopPerc::readPoutAsMap(String pout_file, map >& pep_map) + // Function adapted from MsgfplusReader in Percolator converter + double TopPerc::rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons) { - CsvFile csv_file(pout_file, '\t'); - StringList row; - - for (Size i = 1; i < csv_file.rowCount(); ++i) + // Rescale the fragment features to penalize features calculated by few ions + int numMatchedIonLimit = 7; + int numerator = (1 + numMatchedIonLimit) * (1 + numMatchedIonLimit); + int denominator = (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)) * (1 + (min)(NumMatchedMainIons, numMatchedIonLimit)); + return featureValue * ((double)numerator / denominator); + } + + void TopPerc::assignDeltaScore_(vector& hits, String score_ref, String output_ref) + { + if (!hits.empty()) + { + vector::iterator prev = hits.begin(); + double prev_score = double(prev->getMetaValue(score_ref)); + for (vector::iterator hit = 
hits.begin()+1; hit != hits.end(); ++hit) + { + double cur_score = double(hit->getMetaValue(score_ref)); + double value = prev_score - cur_score; + prev->setMetaValue(output_ref, value); + prev = hit; + } + (hits.end()-1)->setMetaValue(output_ref, 0.0); //if last hit or only one hit + } + } + + bool TopPerc::hasMHCEnd_(String peptide) + { + bool suf = false; + static const string arr[] = {"A", "F", "I", "K", "M", "L", "R", "W", "V"}; + vector mhcends (arr, arr + sizeof(arr) / sizeof(arr[0]) ); + for (std::vector::iterator eit = mhcends.begin(); eit != mhcends.end(); ++eit) { - csv_file.getRow(i, row); - PercolatorResult res(row); - String spec_ref = res.PSMId; - if (pep_map.find(spec_ref) == pep_map.end()) + if (peptide.hasSuffix(string(*eit))) { - pep_map[spec_ref] = vector(); + suf = true; + break; } - pep_map[spec_ref].push_back(res); } + return suf; } } diff --git a/src/openms/source/APPLICATIONS/ToolHandler.cpp b/src/openms/source/APPLICATIONS/ToolHandler.cpp index 46d82e6e5f8..5856b62696a 100755 --- a/src/openms/source/APPLICATIONS/ToolHandler.cpp +++ b/src/openms/source/APPLICATIONS/ToolHandler.cpp @@ -219,6 +219,7 @@ namespace OpenMS util_map["OpenSwathWorkflow"] = Internal::ToolDescription("OpenSwathWorkflow", util_category); util_map["PeakPickerIterative"] = Internal::ToolDescription("PeakPickerIterative", "Signal processing and preprocessing"); //util_map["PeakPickerRapid"] = Internal::ToolDescription("PeakPickerRapid", "Signal processing and preprocessing"); + util_map["PSMFeatureExtractor"] = Internal::ToolDescription("PSMFeatureExtractor", util_category); util_map["QCCalculator"] = Internal::ToolDescription("QCCalculator", util_category); util_map["QCEmbedder"] = Internal::ToolDescription("QCEmbedder", util_category); util_map["QCExtractor"] = Internal::ToolDescription("QCExtractor", util_category); @@ -238,7 +239,6 @@ namespace OpenMS util_map["SvmTheoreticalSpectrumGeneratorTrainer"] = 
Internal::ToolDescription("SvmTheoreticalSpectrumGeneratorTrainer", util_category); util_map["TICCalculator"] = Internal::ToolDescription("TICCalculator", util_category); util_map["TransformationEvaluation"] = Internal::ToolDescription("TransformationEvaluation", util_category); - util_map["TopPerc"] = Internal::ToolDescription("TopPerc", util_category); util_map["XMLValidator"] = Internal::ToolDescription("XMLValidator", util_category); // STOP! insert your tool in alphabetical order for easier maintenance (only tools requiring the GUI lib should be added below) diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp index 2f0efbc77f0..c75a912e2fa 100644 --- a/src/topp/PercolatorAdapter.cpp +++ b/src/topp/PercolatorAdapter.cpp @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -50,6 +49,7 @@ #include #include #include +#include using namespace OpenMS; using namespace std; @@ -165,14 +165,16 @@ class PercolatorAdapter : void registerOptionsAndFlags_() { - registerInputFileList_("in", "", StringList(), "Input file(s)", true); + bool is_required = true; + bool is_advanced_option = true; + registerInputFileList_("in", "", StringList(), "Input file(s)", is_required); setValidFormats_("in", ListUtils::create("mzid,idXML")); - registerInputFileList_("in_decoy", "", StringList(), "Input decoy file(s) in case of separate searches", false); + registerInputFileList_("in_decoy", "", StringList(), "Input decoy file(s) in case of separate searches", !is_required); setValidFormats_("in_decoy", ListUtils::create("mzid,idXML")); - registerOutputFile_("out", "", "", "Output file in idXML format", false); - registerOutputFile_("mzid_out", "", "", "Output file in mzid format", false); + registerOutputFile_("out", "", "", "Output file in idXML format", !is_required); + registerOutputFile_("mzid_out", "", "", "Output file in mzid format", !is_required); String enzs = 
"no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin"; - registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: "+enzs , false); + registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: "+enzs , !is_required); setValidStrings_("enzyme", ListUtils::create(enzs)); registerInputFile_("percolator_executable", "", // choose the default value according to the platform where it will be executed @@ -181,33 +183,34 @@ class PercolatorAdapter : #else "percolator", #endif - "Percolator executable of the installation e.g. 'percolator.exe'", true, false, ListUtils::create("skipexists") + "Percolator executable of the installation e.g. 'percolator.exe'", is_required, !is_advanced_option, ListUtils::create("skipexists") ); //Advanced parameters - registerDoubleOption_("cpos", "", 0.0, "Cpos, penalty for mistakes made on positive examples. Set by cross validation if not specified.", false, true); - registerDoubleOption_("cneg", "", 0.0, "Cneg, penalty for mistakes made on negative examples. Set by cross validation if not specified.", false, true); - registerDoubleOption_("testFDR", "", 0.01, "False discovery rate threshold for evaluating best cross validation result and the reported end result.", false, true); - registerDoubleOption_("trainFDR", "", 0.01, "False discovery rate threshold to define positive examples in training. 
Set to testFDR if 0.", false, true); - registerIntOption_("maxiter", "", 10, "Maximal number of iterations", false, true); - registerFlag_("quick-validation", "Quicker execution by reduced internal cross-validation.", true); - registerOutputFile_("weights", "", "", "Output final weights to the given file", false, true); - registerInputFile_("init-weights", "", "", "Read initial weights to the given file", false, true); - registerStringOption_("default-direction", "", "", "The most informative feature given as the feature name, can be negated to indicate that a lower value is better.", false, true); - registerIntOption_("verbose", "", 2, "Set verbosity of output: 0=no processing info, 5=all.", false, true); - registerFlag_("unitnorm", "Use unit normalization [0-1] instead of standard deviation normalization", true); - registerFlag_("test-each-iteration", "Measure performance on test set each iteration", true); - registerFlag_("override", "Override error check and do not fall back on default score vector in case of suspect score vector", true); - registerIntOption_("seed", "", 1, "Setting seed of the random number generator.", false, true); - registerIntOption_("doc", "", 0, "Include description of correct features", false, true); - registerFlag_("klammer", "Retention time features calculated as in Klammer et al. Only available if -doc is set", true); - registerFlag_("picked-protein", "Use the picked protein-level FDR to infer protein probabilities.", true); - registerInputFile_("fasta", "", "", "Provide the fasta file as the argument to this flag, which will be used for protein grouping based on an in-silico digest (only valid if option -picked-protein is active).", false, true); + registerDoubleOption_("cpos", "", 0.0, "Cpos, penalty for mistakes made on positive examples. Set by cross validation if not specified.", !is_required, is_advanced_option); + registerDoubleOption_("cneg", "", 0.0, "Cneg, penalty for mistakes made on negative examples. 
Set by cross validation if not specified.", !is_required, is_advanced_option); + registerDoubleOption_("testFDR", "", 0.01, "False discovery rate threshold for evaluating best cross validation result and the reported end result.", !is_required, is_advanced_option); + registerDoubleOption_("trainFDR", "", 0.01, "False discovery rate threshold to define positive examples in training. Set to testFDR if 0.", !is_required, is_advanced_option); + registerIntOption_("maxiter", "", 10, "Maximal number of iterations", !is_required, is_advanced_option); + registerFlag_("quick-validation", "Quicker execution by reduced internal cross-validation.", is_advanced_option); + registerOutputFile_("weights", "", "", "Output final weights to the given file", !is_required, is_advanced_option); + registerInputFile_("init-weights", "", "", "Read initial weights to the given file", !is_required, is_advanced_option); + registerStringOption_("default-direction", "", "", "The most informative feature given as the feature name, can be negated to indicate that a lower value is better.", !is_required, is_advanced_option); + registerIntOption_("verbose", "", 2, "Set verbosity of output: 0=no processing info, 5=all.", !is_required, is_advanced_option); + registerFlag_("unitnorm", "Use unit normalization [0-1] instead of standard deviation normalization", is_advanced_option); + registerFlag_("test-each-iteration", "Measure performance on test set each iteration", is_advanced_option); + registerFlag_("override", "Override error check and do not fall back on default score vector in case of suspect score vector", is_advanced_option); + registerIntOption_("seed", "", 1, "Setting seed of the random number generator.", !is_required, is_advanced_option); + registerIntOption_("doc", "", 0, "Include description of correct features", !is_required, is_advanced_option); + registerFlag_("klammer", "Retention time features calculated as in Klammer et al. 
Only available if -doc is set", is_advanced_option); + registerFlag_("picked-protein", "Use the picked protein-level FDR to infer protein probabilities.", is_advanced_option); + registerInputFile_("fasta", "", "", "Provide the fasta file as the argument to this flag, which will be used for protein grouping based on an in-silico digest (only valid if option -picked-protein is active).", !is_required, is_advanced_option); setValidFormats_("fasta", ListUtils::create("FASTA")); - registerStringOption_("decoy-pattern", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (Only valid if option -picked-protein is active).", false, true); - registerFlag_("post-processing-tdc", "Use target-decoy competition to assign q-values and PEPs.", true); + registerStringOption_("decoy-pattern", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (Only valid if option -picked-protein is active).", !is_required, is_advanced_option); + registerFlag_("post-processing-tdc", "Use target-decoy competition to assign q-values and PEPs.", is_advanced_option); } + // TODO: add file specific scan identifiers String getScanIdentifier_(vector::iterator it, vector::iterator start) { String scan_identifier = it->getMetaValue("spectrum_reference"); @@ -223,6 +226,7 @@ class PercolatorAdapter : return scan_identifier.removeWhitespaces(); } + // TODO: add file specific scan numbers Int getScanNumber_(String scan_identifier) { Size idx = 0; @@ -236,8 +240,173 @@ class PercolatorAdapter : } return scan_identifier.toInt(); } + + // Function adapted from Enzyme.h in Percolator converter + bool isEnz_(const char& n, const char& c, string& enz) + { + if (enz == "trypsin") + { + return ((n == 'K' || n == 'R') && c != 'P') || n == '-' || c == '-'; + } + else if (enz == 
"chymotrypsin") + { + return ((n == 'F' || n == 'W' || n == 'Y' || n == 'L') && c != 'P') || n == '-' || c == '-'; + } + else if (enz == "thermolysin") + { + return ((c == 'A' || c == 'F' || c == 'I' || c == 'L' || c == 'M' + || c == 'V' || (n == 'R' && c == 'G')) && n != 'D' && n != 'E') || n == '-' || c == '-'; + } + else if (enz == "proteinasek") + { + return (n == 'A' || n == 'E' || n == 'F' || n == 'I' || n == 'L' + || n == 'T' || n == 'V' || n == 'W' || n == 'Y') || n == '-' || c == '-'; + } + else if (enz == "pepsin") + { + return ((c == 'F' || c == 'L' || c == 'W' || c == 'Y' || n == 'F' + || n == 'L' || n == 'W' || n == 'Y') && n != 'R') || n == '-' || c == '-'; + } + else if (enz == "elastase") + { + return ((n == 'L' || n == 'V' || n == 'A' || n == 'G') && c != 'P') + || n == '-' || c == '-'; + } + else if (enz == "lys-n") + { + return (c == 'K') + || n == '-' || c == '-'; + } + else if (enz == "lys-c") + { + return ((n == 'K') && c != 'P') + || n == '-' || c == '-'; + } + else if (enz == "arg-c") + { + return ((n == 'R') && c != 'P') + || n == '-' || c == '-'; + } + else if (enz == "asp-n") + { + return (c == 'D') + || n == '-' || c == '-'; + } + else if (enz == "glu-c") + { + return ((n == 'E') && (c != 'P')) + || n == '-' || c == '-'; + } + else + { + return true; + } + } + + // Function adapted from Enzyme.h in Percolator converter + Size countEnzymatic_(String peptide, string& enz) + { + Size count = 0; + for (Size ix = 1; ix < peptide.size(); ++ix) + { + if (isEnz_(peptide[ix - 1], peptide[ix], enz)) + { + ++count; + } + } + return count; + } - void readPoutAsMap_(String pout_file, Map >& pep_map) + //TODO for all prepare* PSMId as written in PeptideIdentification::spectrum_reference + // and pre/post AA as - if begin/end of protein ([/] in PeptideEvidence) - see prepareMULTIpin + //id label scannr feature1 ... featureN peptide proteinId1 .. 
proteinIdM + + void preparePin_(vector& peptide_ids, StringList& feature_set, std::string& enz, TextFile& txt, int min_charge, int max_charge) + { + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + String scan_identifier = getScanIdentifier_(it, peptide_ids.begin()); + Int scan_number = getScanNumber_(scan_identifier); + double exp_mass = it->getMZ(); + for (vector::const_iterator jt = it->getHits().begin(); jt != it->getHits().end(); ++jt) + { + PeptideHit hit(*jt); // make a copy of the hit to store temporary features + hit.setMetaValue("SpecId", scan_identifier); + hit.setMetaValue("ScanNr", scan_number); + + int label = 1; + if (hit.metaValueExists("target_decoy") && String(hit.getMetaValue("target_decoy")).hasSubstring("decoy")) + { + label = -1; + } + hit.setMetaValue("Label", label); + + int charge = hit.getCharge(); + String unmodified_sequence = hit.getSequence().toUnmodifiedString(); + + double calc_mass = hit.getSequence().getMonoWeight(Residue::Full, charge)/charge; + hit.setMetaValue("CalcMass", calc_mass); + + + hit.setMetaValue("ExpMass", exp_mass); + hit.setMetaValue("mass", exp_mass); + + double score = hit.getScore(); + hit.setMetaValue("score", score); + + int peptide_length = unmodified_sequence.size(); + hit.setMetaValue("peplen", peptide_length); + + for (int i = min_charge; i <= max_charge; ++i) + { + hit.setMetaValue("charge" + String(i), charge == i); + } + + bool enzN = isEnz_(hit.getPeptideEvidences().front().getAABefore(), unmodified_sequence.prefix(1)[0], enz); + hit.setMetaValue("enzN", enzN); + bool enzC = isEnz_(unmodified_sequence.suffix(1)[0], hit.getPeptideEvidences().front().getAAAfter(), enz); + hit.setMetaValue("enzC", enzC); + int enzInt = countEnzymatic_(unmodified_sequence, enz); + hit.setMetaValue("enzInt", enzInt); + + double delta_mass = exp_mass - calc_mass; + hit.setMetaValue("dm", delta_mass); + + double abs_delta_mass = abs(delta_mass); + hit.setMetaValue("absdm", abs_delta_mass); + 
+ //peptide + String sequence = ""; + sequence += String(hit.getPeptideEvidences().front().getAABefore()); // just first peptide evidence + sequence += "." + hit.getSequence().toString() + "."; + sequence += String(hit.getPeptideEvidences().front().getAAAfter()); //just first peptide evidence + hit.setMetaValue("Peptide", sequence); + + //proteinId1 + StringList proteins; + for (vector::const_iterator kt = hit.getPeptideEvidences().begin(); kt != hit.getPeptideEvidences().end(); ++kt) + { + proteins.push_back(kt->getProteinAccession()); + } + hit.setMetaValue("Proteins", ListUtils::concatenate(proteins, '\t')); + + StringList feats; + for (vector::const_iterator feat = feature_set.begin(); feat != feature_set.end(); ++feat) + { + // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! + if (hit.metaValueExists(*feat)) + { + feats.push_back(hit.getMetaValue(*feat).toString()); + } + } + if (feats.size() == feature_set.size()) + { // only if all feats were present add + txt.addLine(ListUtils::concatenate(feats, '\t')); + } + } + } + } + void readPoutAsMap_(String pout_file, std::map >& pep_map) { CsvFile csv_file(pout_file, '\t'); StringList row; @@ -255,47 +424,14 @@ class PercolatorAdapter : } } - ExitCodes main_(int, const char**) + bool readInputFiles_(StringList in_list, vector& all_peptide_ids, vector& all_protein_ids, int& min_charge, int& max_charge, bool isDecoy) { - //------------------------------------------------------------- - // general variables and data to perform PercolatorAdapter - //------------------------------------------------------------- - vector peptide_ids; - vector protein_ids; - - //------------------------------------------------------------- - // parsing parameters - //------------------------------------------------------------- - const StringList in_list = getStringList_("in"); - const StringList in_decoy = getStringList_("in_decoy"); - LOG_DEBUG << "Input file (of target?): " << 
ListUtils::concatenate(in_list, ",") << " & " << ListUtils::concatenate(in_decoy, ",") << " (decoy)" << endl; - - const String percolator_executable(getStringOption_("percolator_executable")); - writeDebug_(String("Path to the percolator: ") + percolator_executable, 2); - if (percolator_executable.empty()) //TODO? - TOPPBase::findExecutable after registerInputFile_("percolator_executable"... ??? + bool found_decoys = false; + for (StringList::const_iterator fit = in_list.begin(); fit != in_list.end(); ++fit) { - writeLog_("No percolator executable specified. Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } - - const String mzid_out(getStringOption_("mzid_out")); - const String out(getStringOption_("out")); - if (mzid_out.empty() && out.empty()) - { - writeLog_("Fatal error: no output file given (parameter 'out' or 'mzid_out')"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } - - //------------------------------------------------------------- - // read input - //------------------------------------------------------------- - vector > peptide_ids_list; - vector > protein_ids_list; - for (size_t i = 0; i < in_list.size(); ++i) - { - String in = in_list[i]; + vector peptide_ids; + vector protein_ids; + String in = *fit; FileHandler fh; FileTypes::Type in_type = fh.getType(in); if (in_type == FileTypes::IDXML) @@ -304,18 +440,11 @@ class PercolatorAdapter : } else if (in_type == FileTypes::MZIDENTML) { - LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; + LOG_WARN << "Converting from mzid: possible loss of information depending on target format." << endl; MzIdentMLFile().load(in, protein_ids, peptide_ids); } //else catched by TOPPBase:registerInput being mandatory mzid or idxml - if (peptide_ids.empty()) - { - writeLog_("No or empty input file specified. 
Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } - //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process for (vector::iterator pit = peptide_ids.begin(); pit != peptide_ids.end(); ++pit) { @@ -324,139 +453,134 @@ class PercolatorAdapter : // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! if (!pht->metaValueExists("target_decoy")) { - if (!in_decoy.empty()) + if (isDecoy) { - pht->setMetaValue("target_decoy", "target"); + pht->setMetaValue("target_decoy", "decoy"); + found_decoys = true; } else { - writeLog_("No target decoy search results discrimination possible. Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; + pht->setMetaValue("target_decoy", "target"); } } + else if (pht->getMetaValue("target_decoy").toString().hasSubstring("decoy")) + { + found_decoys = true; + } + + if (pht->getCharge() > max_charge) + { + max_charge = pht->getCharge(); + } + if (pht->getCharge() < min_charge) + { + min_charge = pht->getCharge(); + } } } - peptide_ids_list.push_back(peptide_ids); - protein_ids_list.push_back(protein_ids); - } - - //------------------------------------------------------------- - // read more input if necessary - //------------------------------------------------------------- - if (!in_decoy.empty() && in_list.size() == 1) - { - vector decoy_peptide_ids; - vector decoy_protein_ids; - FileHandler fh; - FileTypes::Type in_decoy_type = fh.getType(in_decoy.front()); - if (in_decoy_type == FileTypes::IDXML) - { - IdXMLFile().load(in_decoy.front(), decoy_protein_ids, decoy_peptide_ids); - } - else if (in_decoy_type == FileTypes::MZIDENTML) - { - LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; - MzIdentMLFile().load(in_decoy.front(), decoy_protein_ids, decoy_peptide_ids); - } - + //paranoia check if this comes from the same search engine! 
(only in the first proteinidentification of the first proteinidentifications vector vector) { - if (decoy_protein_ids.front().getSearchEngine() != protein_ids_list.front().front().getSearchEngine() ) + ProteinIdentification::SearchParameters all_search_parameters = all_protein_ids.front().getSearchParameters(); + ProteinIdentification::SearchParameters search_parameters = protein_ids.front().getSearchParameters(); + if (protein_ids.front().getSearchEngine() != all_protein_ids.front().getSearchEngine()) { - LOG_WARN << "Warning about differing SearchEngine between target and decoy run" << endl; + writeLog_("Input files are not all from the same search engine, use TOPP_PSMFeatureExtractor to merge results from different search engines if desired. Aborting!"); + return INCOMPATIBLE_INPUT_DATA; } - if (decoy_protein_ids.front().getScoreType() != protein_ids_list.front().front().getScoreType() ) + + bool identical_extra_features = true; + if (all_search_parameters.metaValueExists("extra_features")) { - LOG_WARN << "Warning about differing ScoreType between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getPrimaryMSRunPath() != protein_ids_list.front().front().getPrimaryMSRunPath() ) - { - LOG_WARN << "Warning about differing SearchInput between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().digestion_enzyme != protein_ids_list.front().front().getSearchParameters().digestion_enzyme ) - { - LOG_WARN << "Warning about differing DigestionEnzyme between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().variable_modifications != protein_ids_list.front().front().getSearchParameters().variable_modifications ) - { - LOG_WARN << "Warning about differing VarMods between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().fixed_modifications != protein_ids_list.front().front().getSearchParameters().fixed_modifications ) - { - LOG_WARN << "Warning 
about differing FixMods between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().charges != protein_ids_list.front().front().getSearchParameters().charges ) - { - LOG_WARN << "Warning about differing SearchCharges between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().fragment_mass_tolerance != protein_ids_list.front().front().getSearchParameters().fragment_mass_tolerance ) - { - LOG_WARN << "Warning about differing FragTol between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().precursor_tolerance != protein_ids_list.front().front().getSearchParameters().precursor_tolerance ) - { - LOG_WARN << "Warning about differing PrecTol between target and decoy run" << endl; + StringList all_search_feature_list = ListUtils::create(all_search_parameters.getMetaValue("extra_features").toString()); + set all_search_feature_set(all_search_feature_list.begin(),all_search_feature_list.end()); + if (search_parameters.metaValueExists("extra_features")) + { + StringList search_feature_list = ListUtils::create(search_parameters.getMetaValue("extra_features").toString()); + set search_feature_set(search_feature_list.begin(), search_feature_list.end()); + identical_extra_features = (search_feature_set == all_search_feature_set); + } + else + { + identical_extra_features = false; + } } - } - - //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process - for (vector::iterator pit = decoy_peptide_ids.begin(); pit != decoy_peptide_ids.end(); ++pit) - { - for (vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) + if (!identical_extra_features) { - pht->setMetaValue("target_decoy", "decoy"); - //TODO what about proteins - internal target decoy handling is shitty - rework pls + writeLog_("Input files do not have the same set of extra features from TOPP_PSMFeatureExtractor. 
Aborting!"); + return INCOMPATIBLE_INPUT_DATA; } } - //TODO check overlap of ids in terms of spectrum id/reference - peptide_ids_list.front().insert( peptide_ids.end(), decoy_peptide_ids.begin(), decoy_peptide_ids.end() ); - protein_ids_list.front().insert( protein_ids.end(), decoy_protein_ids.begin(), decoy_protein_ids.end() ); - writeLog_("Using decoy hits from separate file."); - } - else - { - writeLog_("Using decoy hits from input id file. You did you use a target decoy search, did you?"); + all_peptide_ids.insert(all_peptide_ids.end(), peptide_ids.begin(), peptide_ids.end()); + all_protein_ids.insert(all_protein_ids.end(), protein_ids.begin(), protein_ids.end()); } - + return found_decoys; + } + + ExitCodes main_(int, const char**) + { + //------------------------------------------------------------- + // general variables and data to perform PercolatorAdapter + //------------------------------------------------------------- + vector all_peptide_ids; + vector all_protein_ids; //------------------------------------------------------------- - // extract search engine and prepare pin + // parsing parameters //------------------------------------------------------------- - String se = protein_ids_list.front().front().getSearchEngine(); - for (vector >::iterator pilit = protein_ids_list.begin(); pilit != protein_ids_list.end(); ++pilit) + const StringList in_list = getStringList_("in"); + const StringList in_decoy = getStringList_("in_decoy"); + LOG_DEBUG << "Input file (of target?): " << ListUtils::concatenate(in_list, ",") << " & " << ListUtils::concatenate(in_decoy, ",") << " (decoy)" << endl; + + const String percolator_executable(getStringOption_("percolator_executable")); + writeDebug_(String("Path to the percolator: ") + percolator_executable, 2); + if (percolator_executable.empty()) //TODO? - TOPPBase::findExecutable after registerInputFile_("percolator_executable"... ??? 
{ - if (se != pilit->front().getSearchEngine()) - { - se = "multiple"; - break; - } + writeLog_("No percolator executable specified. Aborting!"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + const String mzid_out(getStringOption_("mzid_out")); + const String out(getStringOption_("out")); + if (mzid_out.empty() && out.empty()) + { + writeLog_("Fatal error: no output file given (parameter 'out' or 'mzid_out')"); + printUsage_(); + return ILLEGAL_PARAMETERS; } - LOG_DEBUG << "Registered search engine: " << se << endl; - TextFile txt; + //------------------------------------------------------------- + // read input + //------------------------------------------------------------- + //TODO introduce min/max charge to parameters for now take available range int max_charge = 0; int min_charge = 10; - for (vector >::iterator pilit = peptide_ids_list.begin(); pilit != peptide_ids_list.end(); ++pilit) + bool found_decoys = readInputFiles_(in_list, all_peptide_ids, all_protein_ids, min_charge, max_charge, false); + if (!in_decoy.empty()) { - for (vector::iterator it = pilit->begin(); it != pilit->end(); ++it) - { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) - { - if (hit->getCharge() > max_charge) - { - max_charge = hit->getCharge(); - } - if (hit->getCharge() < min_charge) - { - min_charge = hit->getCharge(); - } - } - } + found_decoys |= readInputFiles_(in_decoy, all_peptide_ids, all_protein_ids, min_charge, max_charge, true); } LOG_DEBUG << "Using min/max charges of " << min_charge << "/" << max_charge << endl; + + if (!found_decoys) + { + writeLog_("No decoys found, search results discrimination impossible. Aborting!"); + printUsage_(); + return INCOMPATIBLE_INPUT_DATA; + } + + if (all_peptide_ids.empty()) + { + writeLog_("No hits found in input file. 
Aborting!"); + printUsage_(); + return INPUT_FILE_EMPTY; + } + //------------------------------------------------------------- + // prepare pin + //------------------------------------------------------------- StringList feature_set; feature_set.push_back("SpecId"); @@ -465,7 +589,6 @@ class PercolatorAdapter : feature_set.push_back("ExpMass"); feature_set.push_back("CalcMass"); feature_set.push_back("mass"); - feature_set.push_back("score"); feature_set.push_back("peplen"); for (int i = min_charge; i <= max_charge; ++i) { @@ -476,13 +599,21 @@ class PercolatorAdapter : feature_set.push_back("enzInt"); feature_set.push_back("dm"); feature_set.push_back("absdm"); + + ProteinIdentification::SearchParameters search_parameters = all_protein_ids.front().getSearchParameters(); + if (search_parameters.metaValueExists("extra_features")) + { + StringList extra_feature_set = ListUtils::create(search_parameters.getMetaValue("extra_features").toString()); + feature_set.insert(feature_set.end(), extra_feature_set.begin(), extra_feature_set.end()); + } else { + feature_set.push_back("score"); + } + feature_set.push_back("Peptide"); feature_set.push_back("Proteins"); - string enz_str = getStringOption_("enzyme"); - txt.addLine(ListUtils::concatenate(feature_set, '\t')); - TopPerc::preparePin(peptide_ids_list.front(), feature_set, enz_str, txt, min_charge, max_charge); - + string enz_str = getStringOption_("enzyme"); + // create temp directory to store percolator in file pin.tab temporarily String temp_directory_body = QDir::toNativeSeparators((File::getTempDirectory() + "/" + File::getUniqueName() + "/").toQString()); // body for the tmp files { @@ -493,6 +624,10 @@ class PercolatorAdapter : String pin_file(temp_directory_body + txt_designator + "_pin.tab"); String pout_target_file(temp_directory_body + txt_designator + "_target_pout.tab"); String pout_decoy_file(temp_directory_body + txt_designator + "_decoy_pout.tab"); + + TextFile txt; + 
txt.addLine(ListUtils::concatenate(feature_set, '\t')); + preparePin_(all_peptide_ids, feature_set, enz_str, txt, min_charge, max_charge); txt.store(pin_file); QStringList arguments; @@ -571,26 +706,27 @@ class PercolatorAdapter : //------------------------------------------------------------- // when percolator finished calculation, it stores the results -r option (with or without -U) or -m (which seems to be not working) // WARNING: The -r option cannot be used in conjunction with -U: no peptide level statistics are calculated, redirecting PSM level statistics to provided file instead. - Map > pep_map; + map > pep_map; readPoutAsMap_(pout_target_file, pep_map); readPoutAsMap_(pout_decoy_file, pep_map); // As the percolator output file is not needed anymore, the temporary directory is going to be deleted - if (this->debug_level_ < 99) + if (this->debug_level_ < 5) { File::removeDirRecursively(temp_directory_body); + LOG_WARN << "Set debug level to >=5 to keep the temporary files at '" << temp_directory_body << "'" << endl; } else { - LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <99 to remove them." << endl; + LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <5 to remove them." 
<< endl; } // Add the percolator results to the peptide vector of the original input file size_t c_debug = 0; size_t cnt = 0; - for (vector::iterator it = peptide_ids_list.front().begin(); it != peptide_ids_list.front().end(); ++it) + for (vector::iterator it = all_peptide_ids.begin(); it != all_peptide_ids.end(); ++it) { - String scan_identifier = getScanIdentifier_(it, peptide_ids_list.front().begin()); + String scan_identifier = getScanIdentifier_(it, all_peptide_ids.begin()); if (pep_map.find(scan_identifier) == pep_map.end()) { ++c_debug; @@ -616,31 +752,48 @@ class PercolatorAdapter : } } } - LOG_INFO << "No suitable PeptideIdentification for " << c_debug << " out of " << peptide_ids_list.front().size() << endl; + LOG_INFO << "No suitable PeptideIdentification for " << c_debug << " out of " << all_peptide_ids.size() << endl; LOG_INFO << "Suitable PeptideHits for " << cnt << " found." << endl; - for (vector::iterator it = protein_ids_list.front().begin(); it != protein_ids_list.front().end(); ++it) + for (vector::iterator it = all_protein_ids.begin(); it != all_protein_ids.end(); ++it) { - //will not be set because ALL decoy hits got no new score - //it->setSearchEngine("Percolator"); - //it->setScoreType("q-value"); - //it->setHigherScoreBetter(false); + it->setSearchEngine("Percolator"); + it->setScoreType("q-value"); + it->setHigherScoreBetter(false); //TODO add software percolator and PercolatorAdapter it->setMetaValue("percolator", "PercolatorAdapter"); - ProteinIdentification::SearchParameters sp = it->getSearchParameters(); - //TODO write all percolator parameters as set here in sp - it->setSearchParameters(sp); + ProteinIdentification::SearchParameters search_parameters = it->getSearchParameters(); + + search_parameters.setMetaValue("Percolator:cpos", getDoubleOption_("cpos")); + search_parameters.setMetaValue("Percolator:cneg", getDoubleOption_("cneg")); + search_parameters.setMetaValue("Percolator:testFDR", getDoubleOption_("testFDR")); + 
search_parameters.setMetaValue("Percolator:trainFDR", getDoubleOption_("trainFDR")); + search_parameters.setMetaValue("Percolator:quick-validation", getFlag_("quick-validation")); + search_parameters.setMetaValue("Percolator:weights", getStringOption_("weights")); + search_parameters.setMetaValue("Percolator:init-weights", getStringOption_("init-weights")); + search_parameters.setMetaValue("Percolator:default-direction", getStringOption_("default-direction")); + search_parameters.setMetaValue("Percolator:unitnorm", getFlag_("unitnorm")); + search_parameters.setMetaValue("Percolator:override", getFlag_("override")); + search_parameters.setMetaValue("Percolator:seed", getIntOption_("seed")); + search_parameters.setMetaValue("Percolator:doc", getIntOption_("doc")); + search_parameters.setMetaValue("Percolator:klammer", getFlag_("klammer")); + search_parameters.setMetaValue("Percolator:picked-protein", getFlag_("picked-protein")); + search_parameters.setMetaValue("Percolator:fasta", getStringOption_("fasta")); + search_parameters.setMetaValue("Percolator:decoy-pattern", getStringOption_("decoy-pattern")); + search_parameters.setMetaValue("Percolator:post-processing-tdc", getFlag_("post-processing-tdc")); + + it->setSearchParameters(search_parameters); } // Storing the PeptideHits with calculated q-value, pep and svm score if (!mzid_out.empty()) { - MzIdentMLFile().store(mzid_out.toQString().toStdString(), protein_ids_list.front(), peptide_ids_list.front()); + MzIdentMLFile().store(mzid_out.toQString().toStdString(), all_protein_ids, all_peptide_ids); } if (!out.empty()) { - IdXMLFile().store(out.toQString().toStdString(), protein_ids_list.front(), peptide_ids_list.front()); + IdXMLFile().store(out.toQString().toStdString(), all_protein_ids, all_peptide_ids); } writeLog_("PercolatorAdapter finished successfully!"); diff --git a/src/utils/PSMFeatureExtractor.cpp b/src/utils/PSMFeatureExtractor.cpp new file mode 100644 index 00000000000..4299ab9827d --- /dev/null +++ 
b/src/utils/PSMFeatureExtractor.cpp @@ -0,0 +1,289 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2015. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// -------------------------------------------------------------------------- +// $Maintainer: Mathias Walzer $ +// $Authors: Andreas Simon, Mathias Walzer, Matthew The $ +// -------------------------------------------------------------------------- +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace OpenMS; +using namespace std; + +//------------------------------------------------------------- +//Doxygen docu +//------------------------------------------------------------- + +/** + @page TOPP_PSMFeatureExtractor PSMFeatureExtractor + + @brief PSMFeatureExtractor computes extra features for each input PSM + + @experimental This tool is work in progress and usage and input requirements might change. + +
+ + + + + + + + + +
potential predecessor tools \f$ \longrightarrow \f$ PSMFeatureExtractor \f$ \longrightarrow \f$ potential successor tools
@ref TOPP_PercolatorAdapter
+
+ +

PSMFeatureExtractor is search engine sensitive, i.e. its extra features vary, depending on the search engine.

+ + The command line parameters of this tool are: + @verbinclude TOPP_PSMFeatureExtractor.cli + INI file documentation of this tool: + @htmlinclude TOPP_PSMFeatureExtractor.html +*/ + +// We do not want this class to show up in the docu: +/// @cond TOPPCLASSES + + +class PSMFeatureExtractor : + public TOPPBase +{ +public: + PSMFeatureExtractor() : + TOPPBase("PSMFeatureExtractor", "Computes extra features for each input PSM.", false) + { + } + +protected: + void registerOptionsAndFlags_() + { + registerInputFileList_("in", "", StringList(), "Input file(s)", true); + setValidFormats_("in", ListUtils::create("mzid,idXML")); + registerOutputFile_("out", "", "", "Output file in idXML format", false); + registerOutputFile_("mzid_out", "", "", "Output file in mzid format", false); + registerFlag_("multiple_search_engines", "Combine PSMs from different search engines by merging on scan level."); + + registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); + registerFlag_("override_db_check", "Manual override to check if same settings for multiple search engines were applied.", true); + registerFlag_("concat", "Naive merging of PSMs from different search engines: concatenate multiple search results instead of merging on scan level. 
Only valid together wtih -multiple_search_engines flag.", true); + } + + ExitCodes main_(int, const char**) + { + //------------------------------------------------------------- + // general variables and data to perform PSMFeatureExtractor + //------------------------------------------------------------- + vector all_peptide_ids; + vector all_protein_ids; + + //------------------------------------------------------------- + // parsing parameters + //------------------------------------------------------------- + const StringList in_list = getStringList_("in"); + LOG_DEBUG << "Input file (of target?): " << ListUtils::concatenate(in_list, ",") << endl; + + const String mzid_out(getStringOption_("mzid_out")); + const String out(getStringOption_("out")); + if (mzid_out.empty() && out.empty()) + { + writeLog_("Fatal error: no output file given (parameter 'out' or 'mzid_out')"); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + + //------------------------------------------------------------- + // read input + //------------------------------------------------------------- + bool multiple_search_engines = getFlag_("multiple-search-engines"); + bool override_db_check = getFlag_("override_db_check"); + bool concatenate = getFlag_("concat"); + StringList search_engines_used; + for (StringList::const_iterator fit = in_list.begin(); fit != in_list.end(); ++fit) + { + vector peptide_ids; + vector protein_ids; + String in = *fit; + FileHandler fh; + FileTypes::Type in_type = fh.getType(in); + if (in_type == FileTypes::IDXML) + { + IdXMLFile().load(in, protein_ids, peptide_ids); + } + else if (in_type == FileTypes::MZIDENTML) + { + LOG_WARN << "Converting from mzid: possible loss of information depending on target format." << endl; + MzIdentMLFile().load(in, protein_ids, peptide_ids); + } + //else catched by TOPPBase:registerInput being mandatory mzid or idxml + + //paranoia check if this comes from the same search engine! 
(only in the first proteinidentification of the first proteinidentifications vector vector) + { + ProteinIdentification::SearchParameters all_search_parameters = all_protein_ids.front().getSearchParameters(); + ProteinIdentification::SearchParameters search_parameters = protein_ids.front().getSearchParameters(); + if (!multiple_search_engines && protein_ids.front().getSearchEngine() != all_protein_ids.front().getSearchEngine()) + { + writeLog_("Input files are not all from the same search engine, set -multiple_search_engines to allow this. Aborting!"); + return INCOMPATIBLE_INPUT_DATA; + } + + if (!override_db_check && search_parameters.db != all_search_parameters.db) + { + writeLog_("Input files are not searched with the same protein database, set -override_db_check flag to allow this. Aborting!"); + return INCOMPATIBLE_INPUT_DATA; + } + + if (protein_ids.front().getScoreType() != all_protein_ids.front().getScoreType() ) + { + LOG_WARN << "Warning: differing ScoreType between input files" << endl; + } + if (search_parameters.digestion_enzyme != all_search_parameters.digestion_enzyme ) + { + LOG_WARN << "Warning: differing DigestionEnzyme between input files" << endl; + } + if (search_parameters.variable_modifications != all_search_parameters.variable_modifications ) + { + LOG_WARN << "Warning: differing VarMods between input files" << endl; + } + if (search_parameters.fixed_modifications != all_search_parameters.fixed_modifications ) + { + LOG_WARN << "Warning: differing FixMods between input files" << endl; + } + if (search_parameters.charges != all_search_parameters.charges ) + { + LOG_WARN << "Warning: differing SearchCharges between input files" << endl; + } + if (search_parameters.fragment_mass_tolerance != all_search_parameters.fragment_mass_tolerance ) + { + LOG_WARN << "Warning: differing FragTol between input files" << endl; + } + if (search_parameters.precursor_tolerance != all_search_parameters.precursor_tolerance ) + { + LOG_WARN << "Warning: differing 
PrecTol between input files" << endl; + } + } + + if (!multiple_search_engines) + { + all_peptide_ids.insert(all_peptide_ids.end(), peptide_ids.begin(), peptide_ids.end()); + } + else if (concatenate) + { + TopPerc::concatMULTISEids(all_protein_ids, all_peptide_ids, protein_ids, peptide_ids, search_engines_used); + } + else + { + // will collapse the list (reference) + TopPerc::mergeMULTISEids(all_protein_ids, all_peptide_ids, protein_ids, peptide_ids, search_engines_used); + } + } + + //------------------------------------------------------------- + // extract search engine and prepare pin + //------------------------------------------------------------- + String search_engine = all_protein_ids.front().getSearchEngine(); + if (multiple_search_engines) search_engine = "multiple"; + LOG_DEBUG << "Registered search engine: " << search_engine << endl; + + TextFile txt; + + StringList feature_set; + if (search_engine == "multiple") + { + if (getFlag_("concat")) + { + TopPerc::addCONCATSEFeatures(all_peptide_ids, search_engines_used, feature_set); + } + else + { + TopPerc::addMULTISEFeatures(all_peptide_ids, search_engines_used, feature_set); + } + } + //TODO introduce custom feature selection from TopPerc::prepareCUSTOMpin to parameters + else if (search_engine == "MS-GF+") TopPerc::addMSGFFeatures(all_peptide_ids, feature_set); + else if (search_engine == "Mascot") TopPerc::addMASCOTFeatures(all_peptide_ids, feature_set); + else if (search_engine == "XTandem") TopPerc::addXTANDEMFeatures(all_peptide_ids, feature_set); + else if (search_engine == "Comet") TopPerc::addCOMETFeatures(all_peptide_ids, feature_set); + else + { + writeLog_("No known input to create PSM features from. 
Aborting"); + return INCOMPATIBLE_INPUT_DATA; + } + + for (vector::iterator it = all_protein_ids.begin(); it != all_protein_ids.end(); ++it) + { + ProteinIdentification::SearchParameters search_parameters = it->getSearchParameters(); + + search_parameters.setMetaValue("feature_extractor", "TOPP_PSMFeatureExtractor"); + search_parameters.setMetaValue("extra_features", ListUtils::concatenate(feature_set, ",")); + it->setSearchParameters(search_parameters); + } + + // Storing the PeptideHits with calculated q-value, pep and svm score + if (!mzid_out.empty()) + { + MzIdentMLFile().store(mzid_out.toQString().toStdString(), all_protein_ids, all_peptide_ids); + } + if (!out.empty()) + { + IdXMLFile().store(out.toQString().toStdString(), all_protein_ids, all_peptide_ids); + } + + writeLog_("PSMFeatureExtractor finished successfully!"); + return EXECUTION_OK; + } + +}; + + +int main(int argc, const char** argv) +{ + PSMFeatureExtractor tool; + + return tool.main(argc, argv); +} + +/// @endcond diff --git a/src/utils/TopPerc.cpp b/src/utils/TopPerc.cpp deleted file mode 100644 index 36bb79295c1..00000000000 --- a/src/utils/TopPerc.cpp +++ /dev/null @@ -1,583 +0,0 @@ -// -------------------------------------------------------------------------- -// OpenMS -- Open-Source Mass Spectrometry -// -------------------------------------------------------------------------- -// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, -// ETH Zurich, and Freie Universitaet Berlin 2002-2016. -// -// This software is released under a three-clause BSD license: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of any author or any participating institution -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// For a full list of authors, refer to the file AUTHORS. -// -------------------------------------------------------------------------- -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING -// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// -------------------------------------------------------------------------- -// $Maintainer: Mathias Walzer $ -// $Authors: Andreas Simon, Mathias Walzer $ -// -------------------------------------------------------------------------- -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -using namespace OpenMS; -using namespace std; - -//------------------------------------------------------------- -//Doxygen docu -//------------------------------------------------------------- - -/** - @page TOPP_TopPerc TopPerc - - @brief TopPerc facilitates the input to, the call of and output integration of Percolator. 
- Percolator (http://per-colator.com/) is a tool to apply semi-supervised learning for peptide - identification from shotgun proteomics datasets. - - @experimental This tool is work in progress and usage and input requirements might change. - -
- - - - - - - - - - -
potential predecessor tools \f$ \longrightarrow \f$ MSGF+\f$ \longrightarrow \f$ potential successor tools
@ref TOPP_IDFilter @ref TOPP_IDMapper
-
- -

Percolator is search engine sensitive, i.e. it's input features vary, depending on the search engine.

- - The command line parameters of this tool are: - @verbinclude TOPP_TopPerc.cli - INI file documentation of this tool: - @htmlinclude TOPP_TopPerc.html - - Percolator is written by Lukas Käll (http://per-colator.com/ Copyright Lukas Käll ) -*/ - -// We do not want this class to show up in the docu: -/// @cond TOPPCLASSES - - -class TOPPPercolator : - public TOPPBase -{ -public: - TOPPPercolator() : - TOPPBase("TopPerc", "Facilitate input to Percolator and reintegrate.", false) - { - } - -protected: - void registerOptionsAndFlags_() - { - registerInputFileList_("in", "", StringList(), "Input file(s)", true); - setValidFormats_("in", ListUtils::create("mzid,idXML")); - registerInputFileList_("in_decoy", "", StringList(), "Input decoy file(s)", false); - setValidFormats_("in_decoy", ListUtils::create("mzid,idXML")); - registerOutputFile_("out", "", "", "Output file in idXML format", true); - registerOutputFile_("mzid_out", "", "", "Output file in mzid format", true); - std::string enzs = "no_enzyme,elastase,pepsin,proteinasek,thermolysin,chymotrypsin,lys-n,lys-c,arg-c,asp-n,glu-c,trypsin"; - registerStringOption_("enzyme", "", "trypsin", "Type of enzyme: "+enzs , false); - setValidStrings_("enzyme", ListUtils::create(enzs)); - registerInputFile_("percolator_executable", "", - // choose the default value according to the platform where it will be executed - #ifdef OPENMS_WINDOWSPLATFORM - "percolator.exe", - #else - "percolator", - #endif - "Percolator executable of the installation e.g. 'percolator.exe'", true, false, ListUtils::create("skipexists") - ); - - //Advanced parameters - //registerOutputFile_("r", "", "out", "Output tab delimited results to a file instead of stdout", false, true); - //registerOutputFile_("B", "", "", "Output tab delimited results for decoys into a file", false, true); - //registerOutputFile_("X", "", "", "path to file in xml-output format (pout). 
Default is: pout.tab", false, true); - //registerFlag_("e", "read xml-input format (pin) from standard input", true); - //registerFlag_("Z", "Include decoys (PSMs, peptides and/or proteins) in the xml-output. Only available if -X is used.", true); - registerDoubleOption_("p", "", 0.0, "Cpos, penalty for mistakes made on positive examples. Set by cross validation if not specified.", false, true); - registerDoubleOption_("n", "", 0.0, "Cneg, penalty for mistakes made on negative examples. Set by cross validation if not specified.", false, true); - registerDoubleOption_("F", "", 0.01, "False discovery rate threshold to define positive examples in training. Set by cross validation if 0. Default is 0.01.", false, true); - registerDoubleOption_("t", "", 0.01, "False discovery rate threshold for evaluating best cross validation result and the reported end result. Default is 0.01.", false, true); - registerIntOption_("i", "", 0, "Maximal number of iterations", false, true); - registerFlag_("x", "Quicker execution by reduced internal cross-validation.", true); - //registerDoubleOption_("f", "", 0.6, "Fraction of the negative data set to be used as train set when only providing one negative set, remaining examples will be used as test set. Set to 0.6 by default.", false, true); - //registerOutputFile_("J", "", "", "Output the computed features to the given file in tab-delimited format. A file with the features with the given file name will be created", false, true); - //registerInputFile_("k", "", "", "Input file given in the deprecated pin-xml format generated by e.g. 
sqt2pin with the -k option", false, true); - registerOutputFile_("w", "", "", "Output final weights to the given file", false, true); - registerInputFile_("W", "", "", "Read initial weights to the given file", false, true); - registerStringOption_("V", "", "", "The most informative feature given as the feature name, can be negated to indicate that a lower value is better.", false, true); - registerIntOption_("v", "", 2, "Set verbosity of output: 0=no processing info, 5=all, default is 2", false, true); - registerFlag_("u", "Use unit normalization [0-1] instead of standard deviation normalization", true); - registerFlag_("R", "Measure performance on test set each iteration", true); - registerFlag_("O", "Override error check and do not fall back on default score vector in case of suspect score vector", true); - registerIntOption_("S", "", 1, "Setting seed of the random number generator. Default value is 1", false, true); - registerFlag_("K", "Retention time features calculated as in Klammer et al.", true); - registerFlag_("D", "Include description of correct features", true); - registerFlag_("U", "Do not remove redundant peptides, keep all PSMS and exclude peptide level probabilities.", true); - //registerFlag_("s", "skip validation of input file against xml schema", true); - //registerFlag_("A", "output protein level probabilities", true); - //registerDoubleOption_("a", "", 0.0, "Probability with which a present protein emits an associated peptide (to be used jointly with the -A option). Set by grid search if not specified.", false, true); - //registerDoubleOption_("b", "", 0.0, "Probability of the creation of a peptide from noise (to be used jointly with the -A option). Set by grid search if not specified", false, true); - //registerDoubleOption_("G", "", 0.0, "Prior probability of that a protein is present in the sample ( to be used with the -A option). 
Set by grid search if not specified", false, true); - //registerFlag_("g", "treat ties as if it were one protein (Only valid if option -A is active).", true); - //registerFlag_("I", "use pi_0 value when calculating empirical q-values (no effect if option Q is activated) (Only valid if option -A is active).", true); - //registerFlag_("q", "output empirical q-values and p-values (from target-decoy analysis) (Only valid if option -A is active).", true); - //registerFlag_("N", "disactivates the grouping of proteins with similar connectivity, for example if proteins P1 and P2 have the same peptides matching both of them, P1 and P2 will not be grouped as one protein (Only valid if option -A is active).", true); - //registerFlag_("E", "Proteins graph will not be separated in sub-graphs (Only valid if option -A is active).", true); - //registerFlag_("C", "it does not prune peptides with a very low score (~0.0) which means that if a peptide with a very low score is matching two proteins, when we prune the peptide,it will be duplicated to generate two new protein groups (Only valid if option -A is active).", true); - //registerIntOption_("d", "", 0, "Setting depth 0 or 1 or 2 from low depth to high depth(less computational time) of the grid search for the estimation Alpha,Beta and Gamma parameters for fido(Only valid if option -A is active). 
Default value is 0", false, true); - registerStringOption_("P", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (by default : random) (Only valid if option -A is active).", false, true); - //registerFlag_("T", "Reduce the tree of proteins (removing low scored proteins) in order to estimate alpha,beta and gamma faster.(Only valid if option -A is active).", true); - registerFlag_("Y", "Use target decoy competition to compute peptide probabilities.(recommended when using -A).", true); - //registerFlag_("H", "Q-value threshold that will be used in the computation of the MSE and ROC AUC score in the grid search (recommended 0.05 for normal size datasets and 0.1 for big size datasets).(Only valid if option -A is active).", true); - //registerFlag_("fido-truncation", "Proteins with a very low score (< 0.001) will be truncated (assigned 0.0 probability).(Only valid if option -A is active)", true); - //registerFlag_("Q", "Uses protein group level inference, each cluster of proteins is either present or not, therefore when grouping proteins discard all possible combinations for each group.(Only valid if option -A is active and -N is inactive).", true); - registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); - registerFlag_("same_search_db", "Manual override to ckeck if same settings for multiple search engines were applied.", true); - registerFlag_("concat", "Manual override to concatenate multiple search results instead of merging on scan level.", true); - } - - ExitCodes main_(int, const char**) - { - //------------------------------------------------------------- - // general variables and data to perform TopPerc - //------------------------------------------------------------- - vector peptide_ids; - vector protein_ids; - - //------------------------------------------------------------- - // parsing parameters - 
//------------------------------------------------------------- - const StringList in_list = getStringList_("in"); - const StringList in_decoy = getStringList_("in_decoy"); - LOG_DEBUG << "Input file (of target?): " << ListUtils::concatenate(in_list, ",") << " & " << ListUtils::concatenate(in_decoy, ",") << " (decoy)" << endl; - - const String percolator_executable(getStringOption_("percolator_executable")); - writeDebug_(String("Path to the percolator: ") + percolator_executable, 2); - if (percolator_executable.empty()) //TODO? - TOPPBase::findExecutable after registerInputFile_("percolator_executable"... ??? - { - writeLog_("No percolator executable specified. Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } - - const String mzid_out(getStringOption_("mzid_out")); - const String out(getStringOption_("out")); - if (mzid_out.empty() && out.empty()) - { - writeLog_("Fatal error: no output file given (parameter 'out' or 'mzid_out')"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } - - //------------------------------------------------------------- - // read input - //------------------------------------------------------------- - vector > peptide_ids_list; - vector > protein_ids_list; - for (size_t i = 0; i < in_list.size(); ++i) - { - String in = in_list[i]; - FileHandler fh; - FileTypes::Type in_type = fh.getType(in); - if (in_type == FileTypes::IDXML) - { - IdXMLFile().load(in, protein_ids, peptide_ids); - } - else if (in_type == FileTypes::MZIDENTML) - { - LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; - MzIdentMLFile().load(in, protein_ids, peptide_ids); - } - //else catched by TOPPBase:registerInput being mandatory mzid or idxml - - if (peptide_ids.empty()) - { - writeLog_("No or empty input file specified. 
Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } - - //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process - for (std::vector::iterator pit = peptide_ids.begin(); pit != peptide_ids.end(); ++pit) - { - for (vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) - { - // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! - if (!pht->metaValueExists("target_decoy")) - { - if (!in_decoy.empty()) - { - pht->setMetaValue("target_decoy", "target"); - } - else - { - writeLog_("No target decoy search results discrimination possible. Aborting!"); - printUsage_(); - return ILLEGAL_PARAMETERS; - } - } - } - } - peptide_ids_list.push_back(peptide_ids); - protein_ids_list.push_back(protein_ids); - } - - //------------------------------------------------------------- - // read more input if necessary - //------------------------------------------------------------- - if (!in_decoy.empty() && in_list.size() == 1) - { - vector decoy_peptide_ids; - vector decoy_protein_ids; - FileHandler fh; - FileTypes::Type in_decoy_type = fh.getType(in_decoy.front()); - if (in_decoy_type == FileTypes::IDXML) - { - IdXMLFile().load(in_decoy.front(), decoy_protein_ids, decoy_peptide_ids); - } - else if (in_decoy_type == FileTypes::MZIDENTML) - { - LOG_WARN << "Converting from mzid: you might experience loss of information depending on the capabilities of the target format." << endl; - MzIdentMLFile().load(in_decoy.front(), decoy_protein_ids, decoy_peptide_ids); - } - - //paranoia check if this comes from the same search engine! 
(only in the first proteinidentification of the first proteinidentifications vector vector) - { - if (decoy_protein_ids.front().getSearchEngine() != protein_ids_list.front().front().getSearchEngine() ) - { - LOG_WARN << "Warning about differing SearchEngine between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getScoreType() != protein_ids_list.front().front().getScoreType() ) - { - LOG_WARN << "Warning about differing SoreType between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getPrimaryMSRunPath() != protein_ids_list.front().front().getPrimaryMSRunPath() ) - { - LOG_WARN << "Warning about differing SearchInput between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().digestion_enzyme != protein_ids_list.front().front().getSearchParameters().digestion_enzyme ) - { - LOG_WARN << "Warning about differing DigestionEnzyme between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().variable_modifications != protein_ids_list.front().front().getSearchParameters().variable_modifications ) - { - LOG_WARN << "Warning about differing VarMods between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().fixed_modifications != protein_ids_list.front().front().getSearchParameters().fixed_modifications ) - { - LOG_WARN << "Warning about differing FixMods between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().charges != protein_ids_list.front().front().getSearchParameters().charges ) - { - LOG_WARN << "Warning about differing SearchCharges between target and decoy run" << endl; - } - if (decoy_protein_ids.front().getSearchParameters().fragment_mass_tolerance != protein_ids_list.front().front().getSearchParameters().fragment_mass_tolerance ) - { - LOG_WARN << "Warning about differing FragTol between target and decoy run" << endl; - } - if 
(decoy_protein_ids.front().getSearchParameters().precursor_tolerance != protein_ids_list.front().front().getSearchParameters().precursor_tolerance ) - { - LOG_WARN << "Warning about differing PrecTol between target and decoy run" << endl; - } - } - - //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process - for (std::vector::iterator pit = decoy_peptide_ids.begin(); pit != decoy_peptide_ids.end(); ++pit) - { - for (std::vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) - { - pht->setMetaValue("target_decoy", "decoy"); - //TODO what about proteins - internal target decoy handling is shitty - rework pls - } - } - //TODO check overlap of ids in terms of spectrum id/reference - peptide_ids_list.front().insert( peptide_ids.end(), decoy_peptide_ids.begin(), decoy_peptide_ids.end() ); - protein_ids_list.front().insert( protein_ids.end(), decoy_protein_ids.begin(), decoy_protein_ids.end() ); - writeLog_("Using decoy hits from separate file."); - } - else - { - writeLog_("Using decoy hits from input id file. 
You did you use a target decoy search, did you?"); - } - - - //------------------------------------------------------------- - // extract search engine and prepare pin - //------------------------------------------------------------- - String se = protein_ids_list.front().front().getSearchEngine(); - for (vector >::iterator pilit = protein_ids_list.begin(); pilit != protein_ids_list.end(); ++pilit) - { - if (se != pilit->front().getSearchEngine()) - { - se = "multiple"; - break; - } - } - LOG_DEBUG << "Registered search engine: " << se << endl; - TextFile txt; - - //TODO introduce min/max charge to parameters for now take available range - int max_charge = 0; - int min_charge = 10; - for (vector >::iterator pilit = peptide_ids_list.begin(); pilit != peptide_ids_list.end(); ++pilit) - { - for (vector::iterator it = pilit->begin(); it != pilit->end(); ++it) - { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) - { - if (hit->getCharge() > max_charge) - { - max_charge = hit->getCharge(); - } - if (hit->getCharge() < min_charge) - { - min_charge = hit->getCharge(); - } - } - } - } - LOG_DEBUG << "Using min/max charges of " << min_charge << "/" << max_charge << endl; - - string enz_str = getStringOption_("enzyme"); - - //ignore all but first input if NOT multiple for now - if (se == "multiple") - { - if (getFlag_("concat")) - { - LOG_DEBUG << "Concatenating " << protein_ids_list.size() << " and " << peptide_ids_list.size() << endl; - TopPerc::prepareCONCATpin(peptide_ids_list, protein_ids_list, enz_str, txt, min_charge, max_charge); - } - else - { - TopPerc::mergeMULTIids(protein_ids_list,peptide_ids_list, getFlag_("same_search_db")); // will collapse the list (reference) - LOG_DEBUG << "Merged to sizes " << protein_ids_list.size() << " and " << peptide_ids_list.size() << endl; - TopPerc::prepareMULTIpin(peptide_ids_list.front(), protein_ids_list.front().front(), enz_str, txt, min_charge, max_charge); - } - } - //TODO 
introduce custom feature selection from TopPerc::prepareCUSTOMpin to parameters - else if (se == "MS-GF+") TopPerc::prepareMSGFpin(peptide_ids_list.front(), enz_str, txt, min_charge, max_charge, getFlag_("MHC")); - else if (se == "Mascot") TopPerc::prepareMASCOTpin(peptide_ids_list.front(), enz_str, txt, min_charge, max_charge); - else if (se == "XTandem") TopPerc::prepareXTANDEMpin(peptide_ids_list.front(), enz_str, txt, min_charge, max_charge); - else - { - writeLog_("No known input to create percolator features from. Aborting"); - return INCOMPATIBLE_INPUT_DATA; - } - - // create temp directory to store percolator in file pin.tab temporarily - String temp_directory_body = QDir::toNativeSeparators((File::getTempDirectory() + "/" + File::getUniqueName() + "/").toQString()); // body for the tmp files - { - QDir d; - d.mkpath(temp_directory_body.toQString()); - } - String txt_designator = File::getUniqueName(); - String pin_file(temp_directory_body + txt_designator + "_pin.tab"); - String pout_target_file(temp_directory_body + txt_designator + "_target_pout.tab"); - String pout_decoy_file(temp_directory_body + txt_designator + "_decoy_pout.tab"); - txt.store(pin_file); - - QStringList arguments; - // Check all set parameters and get them into arguments StringList - { - arguments << "-U"; - arguments << "-m" << pout_target_file.toQString(); - arguments << "-M" << pout_decoy_file.toQString(); - //if (getFlag_("U")) arguments << "-U"; - //if (getFlag_("e")) arguments << "-e"; - //if (getFlag_("Z")) arguments << "-Z"; - if (getDoubleOption_("p") != 0.0) arguments << "-p" << String(getDoubleOption_("p")).toQString(); - if (getDoubleOption_("n") != 0.0) arguments << "-n" << String(getDoubleOption_("n")).toQString(); - if (getDoubleOption_("F") != 0.01) arguments << "-F" << String(getDoubleOption_("F")).toQString(); - if (getDoubleOption_("t") != 0.01) arguments << "-t" << String(getDoubleOption_("t")).toQString(); - if (getIntOption_("i") != 0) arguments << "-i" << 
String(getIntOption_("i")).toQString(); - if (getFlag_("x")) arguments << "-x"; - //if (getDoubleOption_("f") != 0.6) arguments << "-f" << String(getDoubleOption_("f")).toQString(); - //if (getStringOption_("J") != "") arguments << "-J" << getStringOption_("J").toQString(); - //if (getStringOption_("k") != "") arguments << "-k" << getStringOption_("k").toQString(); - if (getStringOption_("w") != "") arguments << "-w" << getStringOption_("w").toQString(); - if (getStringOption_("W") != "") arguments << "-W" << getStringOption_("W").toQString(); - if (getStringOption_("V") != "") arguments << "-V" << getStringOption_("V").toQString(); - if (getIntOption_("v") != 2) arguments << "-v" << String(getIntOption_("v")).toQString(); - if (getFlag_("u")) arguments << "-u"; - if (getFlag_("R")) arguments << "-R"; - if (getFlag_("O")) arguments << "-O"; - if (getIntOption_("S") != 1) arguments << "-S" << String(getDoubleOption_("S")).toQString(); - if (getFlag_("K")) arguments << "-K"; - if (getFlag_("D")) arguments << "-D"; - //if (getFlag_("s")) arguments << "-s"; - //if (getFlag_("A")) arguments << "-A"; - //if (getDoubleOption_("a") != 0.0) arguments << "-a" << String(getDoubleOption_("a")).toQString(); - //if (getDoubleOption_("b") != 0.0) arguments << "-b" << String(getDoubleOption_("b")).toQString(); - //if (getDoubleOption_("G") != 0.0) arguments << "-G" << String(getDoubleOption_("G")).toQString(); - //if (getFlag_("g")) arguments << "-g"; - //if (getFlag_("I")) arguments << "-I"; - //if (getFlag_("q")) arguments << "-q"; - //if (getFlag_("N")) arguments << "-N"; - //if (getFlag_("E")) arguments << "-E"; - //if (getFlag_("C")) arguments << "-C"; - //if (getIntOption_("d") != 0) arguments << "-d" << String(getIntOption_("d")).toQString(); - if (getStringOption_("P") != "random") arguments << "-P" << getStringOption_("P").toQString(); - //if (getFlag_("T")) arguments << "-T"; - if (getFlag_("Y")) arguments << "-Y"; - //if (getFlag_("H")) arguments << "-H"; - //if 
(getFlag_("fido-truncation")) arguments << "--fido-truncation"; - //if (getFlag_("Q")) arguments << "-Q"; - arguments << pin_file.toQString(); - } - writeLog_("Prepared percolator input."); - - //------------------------------------------------------------- - // run percolator - //------------------------------------------------------------- - // Percolator execution with the executable ant the arguments StringList - int status = QProcess::execute(percolator_executable.toQString(), arguments); // does automatic escaping etc... - if (status != 0) - { - writeLog_("Percolator problem. Aborting! Calling command was: '" + percolator_executable + " \"" + arguments.join("-").toStdString() + "\"."); - // clean temporary files - if (this->debug_level_ < 2) - { - File::removeDirRecursively(temp_directory_body); - LOG_WARN << "Set debug level to >=2 to keep the temporary files at '" << temp_directory_body << "'" << std::endl; - } - else - { - LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <2 to remove them." << std::endl; - } - return EXTERNAL_PROGRAM_ERROR; - } - writeLog_("Executed percolator!"); - - - //------------------------------------------------------------- - // reintegrate pout results - //------------------------------------------------------------- - // when percolator finished calculation, it stores the results -r option (with or without -U) or -m (which seems to be not working) - // WARNING: The -r option cannot be used in conjunction with -U: no peptide level statistics are calculated, redirecting PSM level statistics to provided file instead. 
- map > pep_map; - TopPerc::readPoutAsMap(pout_target_file, pep_map); - TopPerc::readPoutAsMap(pout_decoy_file, pep_map); - - // As the percolator output file is not needed anymore, the temporary directory is going to be deleted - if (this->debug_level_ < 99) - { - File::removeDirRecursively(temp_directory_body); - } - else - { - LOG_WARN << "Keeping the temporary files at '" << temp_directory_body << "'. Set debug level to <99 to remove them." << std::endl; - } - - // Add the percolator results to the peptide vector of the original input file - size_t c_debug = 0; - size_t cnt = 0; - for (vector::iterator it = peptide_ids_list.front().begin(); it != peptide_ids_list.front().end(); ++it) - { - String scan_identifier = TopPerc::getScanIdentifier(it, peptide_ids_list.front().begin()); - if (pep_map.find(scan_identifier) == pep_map.end()) - { - ++c_debug; - LOG_DEBUG << "No suitable PeptideIdentification entry found for .pout entry " << scan_identifier << endl; - continue; - } - - //check each PeptideHit for compliance with one of the PercolatorResults (by sequence) - for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) - { - String pis = hit->getSequence().toUnmodifiedString(); - for (vector::iterator pr = pep_map.find(scan_identifier)->second.begin(); pr != pep_map.find(scan_identifier)->second.end(); ++pr) - { - if (pis == pr->peptide && - pr->preAA == hit->getPeptideEvidences().front().getAABefore() && - pr->postAA == hit->getPeptideEvidences().front().getAAAfter()) - { - hit->setMetaValue("MS:1001492", pr->score); // svm score - hit->setMetaValue("MS:1001491", pr->qvalue); // percolator q value - hit->setMetaValue("MS:1001493", pr->posterior_error_prob); // percolator pep - ++cnt; - } - } - } - } - LOG_INFO << "No suitable PeptideIdentification for " << c_debug << " out of " << peptide_ids_list.front().size() << endl; - LOG_INFO << "No suitable PeptideHits for " << cnt << " found." 
<< endl; - - for (vector::iterator it = protein_ids_list.front().begin(); it != protein_ids_list.front().end(); ++it) - { - //will not be set because ALL decoy hits got no new score - //it->setSearchEngine("Percolator"); - //it->setScoreType("q-value"); - //it->setHigherScoreBetter(false); - - //TODO add software percolator and topperc - it->setMetaValue("percolator", "TopPerc"); - ProteinIdentification::SearchParameters sp = it->getSearchParameters(); - //TODO write all percolator parameters as set here in sp - it->setSearchParameters(sp); - } - - // Storing the PeptideHits with calculated q-value, pep and svm score - if (!mzid_out.empty()) - { - MzIdentMLFile().store(mzid_out.toQString().toStdString(), protein_ids_list.front(), peptide_ids_list.front()); - } - if (!out.empty()) - { - IdXMLFile().store(out.toQString().toStdString(), protein_ids_list.front(), peptide_ids_list.front()); - } - - writeLog_("TopPerc finished successfully!"); - return EXECUTION_OK; - } - -}; - - -int main(int argc, const char** argv) -{ - TOPPPercolator tool; - - return tool.main(argc, argv); -} - -/// @endcond diff --git a/src/utils/executables.cmake b/src/utils/executables.cmake index 9d8d6274d70..2f71eceeeba 100644 --- a/src/utils/executables.cmake +++ b/src/utils/executables.cmake @@ -32,6 +32,7 @@ MultiplexResolver MzMLSplitter OpenMSInfo PeakPickerIterative +PSMFeatureExtractor QCCalculator QCEmbedder QCExporter @@ -51,7 +52,6 @@ SpectraSTSearchAdapter SvmTheoreticalSpectrumGeneratorTrainer TICCalculator TransformationEvaluation -TopPerc XMLValidator ) From a541c36786aa652080d2cfbc3783604e08d285c5 Mon Sep 17 00:00:00 2001 From: Matthew The Date: Thu, 14 Jul 2016 17:34:37 +0200 Subject: [PATCH 23/41] Set up merging of several idXML files. 
Added options for peptide and protein-level FDRs --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 5 +- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 135 ++++---- src/topp/PercolatorAdapter.cpp | 311 ++++++++++++++---- src/utils/PSMFeatureExtractor.cpp | 111 ++++--- 4 files changed, 373 insertions(+), 189 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index bf413c311b0..45e90d03ddb 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -51,8 +51,9 @@ namespace OpenMS { public: - static void mergeMULTISEids(std::vector& all_protein_ids, std::vector& all_peptide_ids, std::vector& new_protein_ids, std::vector& new_peptide_ids, StringList& search_engines_used); - static void concatMULTISEids(std::vector& all_protein_ids, std::vector& all_peptide_ids, std::vector& new_protein_ids, std::vector& new_peptide_ids, StringList& search_engines_used); + static void concatMULTISEPeptideIds(std::vector& all_peptide_ids, std::vector& new_peptide_ids, String search_engine); + static void mergeMULTISEPeptideIds(std::vector& all_peptide_ids, std::vector& new_peptide_ids); + static void mergeMULTISEProteinIds(std::vector& all_protein_ids, std::vector& new_protein_ids); static void addMSGFFeatures(std::vector& peptide_ids, StringList& feature_set); static void addXTANDEMFeatures(std::vector& peptide_ids, StringList& feature_set); diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 58038fc812d..36b7130fd75 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -320,7 +320,7 @@ namespace OpenMS } } - void TopPerc::mergeMULTISEids(vector& all_protein_ids, vector& all_peptide_ids, vector& new_protein_ids, vector& new_peptide_ids, StringList& search_engines_used) + void TopPerc::mergeMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids) { 
LOG_DEBUG << "creating spectrum map" << endl; @@ -335,12 +335,6 @@ namespace OpenMS unified[spectrum_reference] = ins; } - String search_engine = new_protein_ids.front().getSearchEngine(); - if (!ListUtils::contains(search_engines_used, search_engine)) - { - search_engines_used.push_back(search_engine); - } - for (vector::iterator pit = new_peptide_ids.begin(); pit != new_peptide_ids.end(); ++pit) { PeptideIdentification ins = *pit; @@ -409,19 +403,29 @@ namespace OpenMS } all_peptide_ids.swap(swip); LOG_DEBUG << "Now containing " << all_peptide_ids.size() << " spectra identifications."<< endl; - - + } + + // references from PeptideHits to ProteinHits work with the protein accessions, so no need to update the PeptideHits + void TopPerc::mergeMULTISEProteinIds(vector& all_protein_ids, vector& new_protein_ids) + { LOG_DEBUG << "merging search parameters" << endl; //care for search parameters!! - all_protein_ids.front().setIdentifier("TopPerc_multiple_SE_input"); - all_protein_ids.front().setDateTime(DateTime::currentDateTime()); - + if (all_protein_ids.empty()) + { + all_protein_ids.push_back(ProteinIdentification()); + DateTime now = DateTime::now(); + String date_string = now.getDate(); + String identifier = "TopPerc_" + date_string; + all_protein_ids.front().setDateTime(now); + all_protein_ids.front().setIdentifier(identifier); + all_protein_ids.front().setSearchParameters(new_protein_ids.front().getSearchParameters()); + } std::vector& all_protein_hits = all_protein_ids.front().getHits(); - std::vector& new_protein_hits = all_protein_ids.front().getHits(); - all_protein_hits.resize(new_protein_hits.size() + all_protein_hits.size()); + std::vector& new_protein_hits = new_protein_ids.front().getHits(); + LOG_DEBUG << "Sorting " << new_protein_hits.size() << " new ProteinHits." << endl; std::sort(new_protein_hits.begin(), new_protein_hits.end(), TopPerc::lq_ProteinHit()); - LOG_DEBUG << "Sorted " << new_protein_hits.size() << " new ProteinHits." 
<< endl; + LOG_DEBUG << "Melting with " << all_protein_hits.size() << " previous ProteinHits." << endl; if (all_protein_hits.empty()) { @@ -429,48 +433,65 @@ namespace OpenMS } else { - std::vector::iterator uni = std::set_union( + std::vector tmp_protein_hits(new_protein_hits.size() + all_protein_hits.size()); + std::vector::iterator uni = set_union( all_protein_hits.begin(), all_protein_hits.end(), - new_protein_hits.begin(), new_protein_hits.end(), all_protein_hits.begin(), + new_protein_hits.begin(), new_protein_hits.end(), tmp_protein_hits.begin(), TopPerc::lq_ProteinHit() ); - all_protein_hits.resize(uni - all_protein_hits.begin()); + tmp_protein_hits.resize(uni - tmp_protein_hits.begin()); + all_protein_hits.swap(tmp_protein_hits); } - LOG_DEBUG << "Melting ProteinHits." << endl; LOG_DEBUG << "Done with next ProteinHits." << endl; - ProteinIdentification::SearchParameters sp = new_protein_ids.front().getSearchParameters(); - String SE = new_protein_ids.front().getSearchEngine(); - LOG_DEBUG << "Melting Parameters from " << SE << " into MetaInfo." 
<< endl; - {//insert into MetaInfo as SE:param - all_protein_ids.front().setMetaValue("SE:"+SE,new_protein_ids.front().getSearchEngineVersion()); - all_protein_ids.front().setMetaValue(SE+":db",sp.db); - all_protein_ids.front().setMetaValue(SE+":db_version",sp.db_version); - all_protein_ids.front().setMetaValue(SE+":taxonomy",sp.taxonomy); - all_protein_ids.front().setMetaValue(SE+":charges",sp.charges); - all_protein_ids.front().setMetaValue(SE+":fixed_modifications",ListUtils::concatenate(sp.fixed_modifications, ",")); - all_protein_ids.front().setMetaValue(SE+":variable_modifications",ListUtils::concatenate(sp.variable_modifications, ",")); - all_protein_ids.front().setMetaValue(SE+":missed_cleavages",sp.missed_cleavages); - all_protein_ids.front().setMetaValue(SE+":fragment_mass_tolerance",sp.fragment_mass_tolerance); - all_protein_ids.front().setMetaValue(SE+":fragment_mass_tolerance_ppm",sp.fragment_mass_tolerance_ppm); - all_protein_ids.front().setMetaValue(SE+":precursor_tolerance",sp.precursor_tolerance); - all_protein_ids.front().setMetaValue(SE+":precursor_mass_tolerance_ppm",sp.precursor_mass_tolerance_ppm); - all_protein_ids.front().setMetaValue(SE+":digestion_enzyme",sp.digestion_enzyme.getName()); - all_protein_ids.front().setPrimaryMSRunPath(new_protein_ids.front().getPrimaryMSRunPath()); - all_protein_ids.front().setSearchEngine("multiple"); + String SE = new_protein_ids.front().getSearchEngine(); + StringList keys; + all_protein_ids.front().getSearchParameters().getKeys(keys); + if (!ListUtils::contains(keys, "SE:" + SE)) + { + LOG_DEBUG << "Melting Parameters from " << SE << " into MetaInfo." 
<< endl; + + //insert into MetaInfo as SE:param + ProteinIdentification::SearchParameters sp = new_protein_ids.front().getSearchParameters(); + ProteinIdentification::SearchParameters all_sp = all_protein_ids.front().getSearchParameters(); + all_sp.setMetaValue("SE:"+SE,new_protein_ids.front().getSearchEngineVersion()); + all_sp.setMetaValue(SE+":db",sp.db); + all_sp.setMetaValue(SE+":db_version",sp.db_version); + all_sp.setMetaValue(SE+":taxonomy",sp.taxonomy); + all_sp.setMetaValue(SE+":charges",sp.charges); + all_sp.setMetaValue(SE+":fixed_modifications",ListUtils::concatenate(sp.fixed_modifications, ",")); + all_sp.setMetaValue(SE+":variable_modifications",ListUtils::concatenate(sp.variable_modifications, ",")); + all_sp.setMetaValue(SE+":missed_cleavages",sp.missed_cleavages); + all_sp.setMetaValue(SE+":fragment_mass_tolerance",sp.fragment_mass_tolerance); + all_sp.setMetaValue(SE+":fragment_mass_tolerance_ppm",sp.fragment_mass_tolerance_ppm); + all_sp.setMetaValue(SE+":precursor_tolerance",sp.precursor_tolerance); + all_sp.setMetaValue(SE+":precursor_mass_tolerance_ppm",sp.precursor_mass_tolerance_ppm); + all_sp.setMetaValue(SE+":digestion_enzyme",sp.digestion_enzyme.getName()); + + if (!all_protein_ids.front().getSearchEngine().empty()) + { + all_protein_ids.front().setSearchEngine("multiple"); + } + else + { + all_protein_ids.front().setSearchEngine(SE); + LOG_DEBUG << "Setting search engine to " << SE << endl; + } LOG_DEBUG << "Done with next Parameters." 
<< endl; + all_protein_ids.front().setSearchParameters(all_sp); } + StringList all_primary_ms_run_path = all_protein_ids.front().getPrimaryMSRunPath(); + StringList new_primary_ms_run_path = new_protein_ids.front().getPrimaryMSRunPath(); + all_primary_ms_run_path.insert(all_primary_ms_run_path.end(), new_primary_ms_run_path.begin(), new_primary_ms_run_path.end()); + all_protein_ids.front().setPrimaryMSRunPath(all_primary_ms_run_path); + LOG_DEBUG << "New primary run paths: " << ListUtils::concatenate(new_primary_ms_run_path,",") << endl; + LOG_DEBUG << "All primary run paths: " << ListUtils::concatenate(all_primary_ms_run_path,",") << endl; + LOG_DEBUG << "All merging finished." << endl; } - void TopPerc::concatMULTISEids(vector& all_protein_ids, vector& all_peptide_ids, vector& new_protein_ids, vector& new_peptide_ids, StringList& search_engines_used) - { - String search_engine = new_protein_ids.front().getSearchEngine(); - if (!ListUtils::contains(search_engines_used, search_engine)) - { - search_engines_used.push_back(search_engine); - } - + void TopPerc::concatMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids, String search_engine) + { for (vector::iterator pit = new_peptide_ids.begin(); pit != new_peptide_ids.end(); ++pit) { for (vector::iterator hit = pit->getHits().begin(); hit != pit->getHits().end(); ++hit) @@ -500,28 +521,6 @@ namespace OpenMS } } all_peptide_ids.insert(all_peptide_ids.end(), new_peptide_ids.begin(), new_peptide_ids.end()); - - ProteinIdentification::SearchParameters sp = new_protein_ids.front().getSearchParameters(); - String SE = new_protein_ids.front().getSearchEngine(); - LOG_DEBUG << "Melting Parameters from " << SE << " into MetaInfo." 
<< endl; - {//insert into MetaInfo as SE:param - all_protein_ids.front().setMetaValue("SE:"+SE,new_protein_ids.front().getSearchEngineVersion()); - all_protein_ids.front().setMetaValue(SE+":db",sp.db); - all_protein_ids.front().setMetaValue(SE+":db_version",sp.db_version); - all_protein_ids.front().setMetaValue(SE+":taxonomy",sp.taxonomy); - all_protein_ids.front().setMetaValue(SE+":charges",sp.charges); - all_protein_ids.front().setMetaValue(SE+":fixed_modifications",ListUtils::concatenate(sp.fixed_modifications, ",")); - all_protein_ids.front().setMetaValue(SE+":variable_modifications",ListUtils::concatenate(sp.variable_modifications, ",")); - all_protein_ids.front().setMetaValue(SE+":missed_cleavages",sp.missed_cleavages); - all_protein_ids.front().setMetaValue(SE+":fragment_mass_tolerance",sp.fragment_mass_tolerance); - all_protein_ids.front().setMetaValue(SE+":fragment_mass_tolerance_ppm",sp.fragment_mass_tolerance_ppm); - all_protein_ids.front().setMetaValue(SE+":precursor_tolerance",sp.precursor_tolerance); - all_protein_ids.front().setMetaValue(SE+":precursor_mass_tolerance_ppm",sp.precursor_mass_tolerance_ppm); - all_protein_ids.front().setMetaValue(SE+":digestion_enzyme",sp.digestion_enzyme.getName()); - all_protein_ids.front().setPrimaryMSRunPath(new_protein_ids.front().getPrimaryMSRunPath()); - all_protein_ids.front().setSearchEngine("multiple"); - LOG_DEBUG << "Done with next Parameters." 
<< endl; - } } void TopPerc::addMULTISEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set) diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp index c75a912e2fa..bea98600119 100644 --- a/src/topp/PercolatorAdapter.cpp +++ b/src/topp/PercolatorAdapter.cpp @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -126,7 +127,7 @@ class PercolatorAdapter : proteinIds (pl) { } - + PercolatorResult(StringList& row): proteinIds() { @@ -163,6 +164,33 @@ class PercolatorAdapter : } }; + struct PercolatorProteinResult + { + String protein_accession; + double qvalue; + double posterior_error_prob; + + PercolatorProteinResult(const String& pid, const double q, const double pep): + protein_accession (pid), + qvalue (q), + posterior_error_prob (pep) + { + } + + bool operator!=(const PercolatorProteinResult& rhs) const + { + if (protein_accession != rhs.protein_accession || qvalue != rhs.qvalue || + posterior_error_prob != rhs.posterior_error_prob) + return true; + return false; + } + + bool operator==(const PercolatorProteinResult& rhs) const + { + return !(operator !=(rhs)); + } + }; + void registerOptionsAndFlags_() { bool is_required = true; @@ -185,8 +213,12 @@ class PercolatorAdapter : #endif "Percolator executable of the installation e.g. 'percolator.exe'", is_required, !is_advanced_option, ListUtils::create("skipexists") ); + registerFlag_("peptide-level-fdrs", "Calculate peptide-level FDRs instead of PSM-level FDRs."); + registerFlag_("protein-level-fdrs", "Use the picked protein-level FDR to infer protein probabilities. Use the -fasta option and -decoy-pattern to set the Fasta file and decoy pattern."); //Advanced parameters + registerFlag_("generic-feature-set", "Use only generic (i.e. not search engine specific) features. 
Generating search engine specific features for common search engines by PSMFeatureExtractor will typically boost the identification rate significantly.", is_advanced_option); + registerIntOption_("subset-max-train", "", 0, "Only train an SVM on a subset of PSMs, and use the resulting score vector to evaluate the other PSMs. Recommended when analyzing huge numbers (>1 million) of PSMs. When set to 0, all PSMs are used for training as normal.", !is_required, is_advanced_option); registerDoubleOption_("cpos", "", 0.0, "Cpos, penalty for mistakes made on positive examples. Set by cross validation if not specified.", !is_required, is_advanced_option); registerDoubleOption_("cneg", "", 0.0, "Cneg, penalty for mistakes made on negative examples. Set by cross validation if not specified.", !is_required, is_advanced_option); registerDoubleOption_("testFDR", "", 0.01, "False discovery rate threshold for evaluating best cross validation result and the reported end result.", !is_required, is_advanced_option); @@ -203,10 +235,9 @@ class PercolatorAdapter : registerIntOption_("seed", "", 1, "Setting seed of the random number generator.", !is_required, is_advanced_option); registerIntOption_("doc", "", 0, "Include description of correct features", !is_required, is_advanced_option); registerFlag_("klammer", "Retention time features calculated as in Klammer et al. 
Only available if -doc is set", is_advanced_option); - registerFlag_("picked-protein", "Use the picked protein-level FDR to infer protein probabilities.", is_advanced_option); - registerInputFile_("fasta", "", "", "Provide the fasta file as the argument to this flag, which will be used for protein grouping based on an in-silico digest (only valid if option -picked-protein is active).", !is_required, is_advanced_option); + registerInputFile_("fasta", "", "", "Provide the fasta file as the argument to this flag, which will be used for protein grouping based on an in-silico digest (only valid if option -protein-level-fdrs is active).", !is_required, is_advanced_option); setValidFormats_("fasta", ListUtils::create("FASTA")); - registerStringOption_("decoy-pattern", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (Only valid if option -picked-protein is active).", !is_required, is_advanced_option); + registerStringOption_("decoy-pattern", "", "random", "Define the text pattern to identify the decoy proteins and/or PSMs, set this up if the label that identifies the decoys in the database is not the default (Only valid if option -protein-level-fdrs is active).", !is_required, is_advanced_option); registerFlag_("post-processing-tdc", "Use target-decoy competition to assign q-values and PEPs.", is_advanced_option); } @@ -316,11 +347,8 @@ class PercolatorAdapter : } return count; } - - //TODO for all prepare* PSMId as written in PeptideIdentification::spectrum_reference - // and pre/post AA as - if begin/end of protein ([/] in PeptideEvidence) - see prepareMULTIpin - //id label scannr feature1 ... featureN peptide proteinId1 .. proteinIdM - + + //id label scannr calcmass expmass feature1 ... featureN peptide proteinId1 .. 
proteinIdM void preparePin_(vector& peptide_ids, StringList& feature_set, std::string& enz, TextFile& txt, int min_charge, int max_charge) { for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) @@ -377,9 +405,14 @@ class PercolatorAdapter : //peptide String sequence = ""; - sequence += String(hit.getPeptideEvidences().front().getAABefore()); // just first peptide evidence + // just first peptide evidence + String aa_before(hit.getPeptideEvidences().front().getAABefore()); + String aa_after(hit.getPeptideEvidences().front().getAAAfter()); + aa_before = aa_before=="["?'-':aa_before; + aa_after = aa_after=="]"?'-':aa_after; + sequence += aa_before; sequence += "." + hit.getSequence().toString() + "."; - sequence += String(hit.getPeptideEvidences().front().getAAAfter()); //just first peptide evidence + sequence += aa_after; hit.setMetaValue("Peptide", sequence); //proteinId1 @@ -406,7 +439,8 @@ class PercolatorAdapter : } } } - void readPoutAsMap_(String pout_file, std::map >& pep_map) + + void readPoutAsMap_(String pout_file, std::map& pep_map) { CsvFile csv_file(pout_file, '\t'); StringList row; @@ -415,25 +449,45 @@ class PercolatorAdapter : { csv_file.getRow(i, row); PercolatorResult res(row); - String spec_ref = res.PSMId; + String spec_ref = res.PSMId + res.peptide; + // retain only the best result in the unlikely case that a PSMId+peptide combination occurs multiple times if (pep_map.find(spec_ref) == pep_map.end()) { - pep_map[spec_ref] = vector(); + pep_map.insert( map::value_type ( spec_ref, res ) ); } - pep_map[spec_ref].push_back(res); } } - bool readInputFiles_(StringList in_list, vector& all_peptide_ids, vector& all_protein_ids, int& min_charge, int& max_charge, bool isDecoy) + void readProteinPoutAsMap_(String pout_protein_file, std::map& protein_map) { - bool found_decoys = false; - for (StringList::const_iterator fit = in_list.begin(); fit != in_list.end(); ++fit) + CsvFile csv_file(pout_protein_file, '\t'); + StringList row; 
+ + for (Size i = 1; i < csv_file.rowCount(); ++i) { + csv_file.getRow(i, row); + StringList protein_accessions; + row[0].split(",", protein_accessions); + double qvalue = row[2].toDouble(); + double posterior_error_prob = row[3].toDouble(); + for (StringList::iterator it = protein_accessions.begin(); it != protein_accessions.end(); ++it) + { + protein_map.insert( map::value_type ( *it, PercolatorProteinResult(*it, qvalue, posterior_error_prob ) ) ); + } + } + } + + ExitCodes readInputFiles_(StringList in_list, vector& all_peptide_ids, vector& all_protein_ids, bool isDecoy, bool& found_decoys, int& min_charge, int& max_charge) + { + for (StringList::iterator fit = in_list.begin(); fit != in_list.end(); ++fit) + { + String file_idx(distance(in_list.begin(), fit)); vector peptide_ids; vector protein_ids; String in = *fit; FileHandler fh; FileTypes::Type in_type = fh.getType(in); + LOG_INFO << "Loading input file: " << in << endl; if (in_type == FileTypes::IDXML) { IdXMLFile().load(in, protein_ids, peptide_ids); @@ -448,6 +502,12 @@ class PercolatorAdapter : //being paranoid about the presence of target decoy denominations, which are crucial to the percolator process for (vector::iterator pit = peptide_ids.begin(); pit != peptide_ids.end(); ++pit) { + if (in_list.size() > 1) + { + String scan_identifier = getScanIdentifier_(pit, peptide_ids.begin()); + scan_identifier = "file=" + file_idx + "," + scan_identifier; + pit->setMetaValue("spectrum_reference", scan_identifier); + } for (vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) { // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! @@ -480,16 +540,16 @@ class PercolatorAdapter : } //paranoia check if this comes from the same search engine! 
(only in the first proteinidentification of the first proteinidentifications vector vector) - { - ProteinIdentification::SearchParameters all_search_parameters = all_protein_ids.front().getSearchParameters(); - ProteinIdentification::SearchParameters search_parameters = protein_ids.front().getSearchParameters(); + if (!all_protein_ids.empty()) { if (protein_ids.front().getSearchEngine() != all_protein_ids.front().getSearchEngine()) { - writeLog_("Input files are not all from the same search engine, use TOPP_PSMFeatureExtractor to merge results from different search engines if desired. Aborting!"); + writeLog_("Input files are not all from the same search engine: " + protein_ids.front().getSearchEngine() + " and " + all_protein_ids.front().getSearchEngine() + ". Use TOPP_PSMFeatureExtractor to merge results from different search engines if desired. Aborting!"); return INCOMPATIBLE_INPUT_DATA; } bool identical_extra_features = true; + ProteinIdentification::SearchParameters all_search_parameters = all_protein_ids.front().getSearchParameters(); + ProteinIdentification::SearchParameters search_parameters = protein_ids.front().getSearchParameters(); if (all_search_parameters.metaValueExists("extra_features")) { StringList all_search_feature_list = ListUtils::create(all_search_parameters.getMetaValue("extra_features").toString()); @@ -510,11 +570,42 @@ class PercolatorAdapter : writeLog_("Input files do not have the same set of extra features from TOPP_PSMFeatureExtractor. 
Aborting!"); return INCOMPATIBLE_INPUT_DATA; } + + if (protein_ids.front().getScoreType() != all_protein_ids.front().getScoreType() ) + { + LOG_WARN << "Warning: differing ScoreType between input files" << endl; + } + if (search_parameters.digestion_enzyme != all_search_parameters.digestion_enzyme ) + { + LOG_WARN << "Warning: differing DigestionEnzyme between input files" << endl; + } + if (search_parameters.variable_modifications != all_search_parameters.variable_modifications ) + { + LOG_WARN << "Warning: differing VarMods between input files" << endl; + } + if (search_parameters.fixed_modifications != all_search_parameters.fixed_modifications ) + { + LOG_WARN << "Warning: differing FixMods between input files" << endl; + } + if (search_parameters.charges != all_search_parameters.charges ) + { + LOG_WARN << "Warning: differing SearchCharges between input files" << endl; + } + if (search_parameters.fragment_mass_tolerance != all_search_parameters.fragment_mass_tolerance ) + { + LOG_WARN << "Warning: differing FragTol between input files" << endl; + } + if (search_parameters.precursor_tolerance != all_search_parameters.precursor_tolerance ) + { + LOG_WARN << "Warning: differing PrecTol between input files" << endl; + } } + LOG_INFO << "Merging peptide ids." << endl; all_peptide_ids.insert(all_peptide_ids.end(), peptide_ids.begin(), peptide_ids.end()); - all_protein_ids.insert(all_protein_ids.end(), protein_ids.begin(), protein_ids.end()); + LOG_INFO << "Merging protein ids." 
<< endl; + TopPerc::mergeMULTISEProteinIds(all_protein_ids, protein_ids); } - return found_decoys; + return EXECUTION_OK; } ExitCodes main_(int, const char**) @@ -549,6 +640,9 @@ class PercolatorAdapter : printUsage_(); return ILLEGAL_PARAMETERS; } + + bool peptide_level_fdrs = getFlag_("peptide-level-fdrs"); + bool protein_level_fdrs = getFlag_("protein-level-fdrs"); //------------------------------------------------------------- // read input @@ -557,10 +651,22 @@ class PercolatorAdapter : //TODO introduce min/max charge to parameters for now take available range int max_charge = 0; int min_charge = 10; - bool found_decoys = readInputFiles_(in_list, all_peptide_ids, all_protein_ids, min_charge, max_charge, false); + bool is_decoy = false; + bool found_decoys = false; + ExitCodes read_exit = readInputFiles_(in_list, all_peptide_ids, all_protein_ids, is_decoy, found_decoys, min_charge, max_charge); + if (read_exit != EXECUTION_OK) + { + return read_exit; + } + if (!in_decoy.empty()) { - found_decoys |= readInputFiles_(in_decoy, all_peptide_ids, all_protein_ids, min_charge, max_charge, true); + is_decoy = true; + read_exit = readInputFiles_(in_decoy, all_peptide_ids, all_protein_ids, is_decoy, found_decoys, min_charge, max_charge); + if (read_exit != EXECUTION_OK) + { + return read_exit; + } } LOG_DEBUG << "Using min/max charges of " << min_charge << "/" << max_charge << endl; @@ -573,7 +679,14 @@ class PercolatorAdapter : if (all_peptide_ids.empty()) { - writeLog_("No hits found in input file. Aborting!"); + writeLog_("No peptide hits found in input file. Aborting!"); + printUsage_(); + return INPUT_FILE_EMPTY; + } + + if (all_protein_ids.empty()) + { + writeLog_("No protein hits found in input file. 
Aborting!"); printUsage_(); return INPUT_FILE_EMPTY; } @@ -605,8 +718,16 @@ class PercolatorAdapter : { StringList extra_feature_set = ListUtils::create(search_parameters.getMetaValue("extra_features").toString()); feature_set.insert(feature_set.end(), extra_feature_set.begin(), extra_feature_set.end()); - } else { + } + else if (getFlag_("generic-feature-set")) + { feature_set.push_back("score"); + } + else + { + writeLog_("No search engine specific features found. Generate search engine specific features using PSMFeatureExtractor or set the -generic-features-set flag to override. Aborting!"); + printUsage_(); + return INCOMPATIBLE_INPUT_DATA; } feature_set.push_back("Peptide"); @@ -622,8 +743,12 @@ class PercolatorAdapter : } String txt_designator = File::getUniqueName(); String pin_file(temp_directory_body + txt_designator + "_pin.tab"); - String pout_target_file(temp_directory_body + txt_designator + "_target_pout.tab"); - String pout_decoy_file(temp_directory_body + txt_designator + "_decoy_pout.tab"); + String pout_target_file(temp_directory_body + txt_designator + "_target_pout_psms.tab"); + String pout_decoy_file(temp_directory_body + txt_designator + "_decoy_pout_psms.tab"); + String pout_target_file_peptides(temp_directory_body + txt_designator + "_target_pout_peptides.tab"); + String pout_decoy_file_peptides(temp_directory_body + txt_designator + "_decoy_pout_peptides.tab"); + String pout_target_file_proteins(temp_directory_body + txt_designator + "_target_pout_proteins.tab"); + String pout_decoy_file_proteins(temp_directory_body + txt_designator + "_decoy_pout_proteins.tab"); TextFile txt; txt.addLine(ListUtils::concatenate(feature_set, '\t')); @@ -632,11 +757,28 @@ class PercolatorAdapter : QStringList arguments; // Check all set parameters and get them into arguments StringList - { - arguments << "-U"; + { + if (peptide_level_fdrs) + { + arguments << "-r" << pout_target_file_peptides.toQString(); + arguments << "-B" << 
pout_decoy_file_peptides.toQString(); + } + else + { + arguments << "-U"; + } arguments << "-m" << pout_target_file.toQString(); arguments << "-M" << pout_decoy_file.toQString(); + String fasta_file = getStringOption_("fasta"); + if (fasta_file.empty()) fasta_file = "auto"; + if (protein_level_fdrs) + { + arguments << "-f" << fasta_file.toQString(); + arguments << "-l" << pout_target_file_proteins.toQString(); + arguments << "-L" << pout_decoy_file_proteins.toQString(); + } + double cpos = getDoubleOption_("cpos"); double cneg = getDoubleOption_("cneg"); if (cpos != 0.0) arguments << "-p" << String(cpos).toQString(); @@ -649,6 +791,8 @@ class PercolatorAdapter : Int max_iter = getIntOption_("maxiter"); if (max_iter != 10) arguments << "-i" << String(max_iter).toQString(); + Int subset_max_train = getIntOption_("subset-max-train"); + if (subset_max_train > 0) arguments << "-N" << String(subset_max_train).toQString(); if (getFlag_("quick-validation")) arguments << "-x"; if (getFlag_("post-processing-tdc")) arguments << "-Y"; @@ -706,15 +850,27 @@ class PercolatorAdapter : //------------------------------------------------------------- // when percolator finished calculation, it stores the results -r option (with or without -U) or -m (which seems to be not working) // WARNING: The -r option cannot be used in conjunction with -U: no peptide level statistics are calculated, redirecting PSM level statistics to provided file instead. 
- map > pep_map; - readPoutAsMap_(pout_target_file, pep_map); - readPoutAsMap_(pout_decoy_file, pep_map); - + map pep_map; + if (peptide_level_fdrs) + { + readPoutAsMap_(pout_target_file_peptides, pep_map); + readPoutAsMap_(pout_decoy_file_peptides, pep_map); + } + else + { + readPoutAsMap_(pout_target_file, pep_map); + readPoutAsMap_(pout_decoy_file, pep_map); + } + + map protein_map; + readProteinPoutAsMap_(pout_target_file_proteins, protein_map); + readProteinPoutAsMap_(pout_decoy_file_proteins, protein_map); + // As the percolator output file is not needed anymore, the temporary directory is going to be deleted if (this->debug_level_ < 5) { File::removeDirRecursively(temp_directory_body); - LOG_WARN << "Set debug level to >=5 to keep the temporary files at '" << temp_directory_body << "'" << endl; + LOG_WARN << "Removing temporary directory for Percolator in/output. Set debug level to >=5 to keep the temporary files." << endl; } else { @@ -722,63 +878,86 @@ class PercolatorAdapter : } // Add the percolator results to the peptide vector of the original input file - size_t c_debug = 0; + //size_t c_debug = 0; size_t cnt = 0; + String run_identifier = all_protein_ids.front().getIdentifier(); for (vector::iterator it = all_peptide_ids.begin(); it != all_peptide_ids.end(); ++it) { + it->setIdentifier(run_identifier); + it->setScoreType("q-value"); + it->setHigherScoreBetter(false); + String scan_identifier = getScanIdentifier_(it, all_peptide_ids.begin()); - if (pep_map.find(scan_identifier) == pep_map.end()) - { - ++c_debug; - LOG_DEBUG << "No suitable PeptideIdentification entry found for .pout entry " << scan_identifier << endl; - continue; - } - + //check each PeptideHit for compliance with one of the PercolatorResults (by sequence) for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - String pis = hit->getSequence().toString(); - for (vector::iterator pr = pep_map.find(scan_identifier)->second.begin(); pr != 
pep_map.find(scan_identifier)->second.end(); ++pr) + String peptide_sequence = hit->getSequence().toString(); + String psm_identifier = scan_identifier + peptide_sequence; + + map::iterator pr = pep_map.find(psm_identifier); + if (pr != pep_map.end()) { - if (pis == pr->peptide && - pr->preAA == hit->getPeptideEvidences().front().getAABefore() && - pr->postAA == hit->getPeptideEvidences().front().getAAAfter()) - { - hit->setMetaValue("MS:1001492", pr->score); // svm score - hit->setMetaValue("MS:1001491", pr->qvalue); // percolator q value - hit->setMetaValue("MS:1001493", pr->posterior_error_prob); // percolator pep - ++cnt; - } + hit->setMetaValue("MS:1001492", pr->second.score); // svm score + hit->setMetaValue("MS:1001491", pr->second.qvalue); // percolator q value + hit->setMetaValue("MS:1001493", pr->second.posterior_error_prob); // percolator pep + hit->setScore(pr->second.qvalue); + ++cnt; } } } - LOG_INFO << "No suitable PeptideIdentification for " << c_debug << " out of " << all_peptide_ids.size() << endl; + //LOG_INFO << "No suitable PeptideIdentification for " << c_debug << " out of " << all_peptide_ids.size() << endl; LOG_INFO << "Suitable PeptideHits for " << cnt << " found." 
<< endl; + // TODO: There should only be 1 ProteinIdentification element in this vector, no need for a for loop for (vector::iterator it = all_protein_ids.begin(); it != all_protein_ids.end(); ++it) - { - it->setSearchEngine("Percolator"); - it->setScoreType("q-value"); - it->setHigherScoreBetter(false); - + { + if (protein_level_fdrs) + { + //check each ProteinHit for compliance with one of the PercolatorProteinResults (by accession) + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + String protein_accession = hit->getAccession(); + map::iterator pr = protein_map.find(protein_accession); + if (pr != protein_map.end()) + { + hit->setMetaValue("MS:1001491", pr->second.qvalue); // percolator q value + hit->setMetaValue("MS:1001493", pr->second.posterior_error_prob); // percolator pep + hit->setScore(pr->second.qvalue); + } + else + { + hit->setScore(1.0); + } + } + it->setSearchEngine("Percolator"); + it->setScoreType("q-value"); + it->setHigherScoreBetter(false); + it->sort(); + } + //TODO add software percolator and PercolatorAdapter it->setMetaValue("percolator", "PercolatorAdapter"); ProteinIdentification::SearchParameters search_parameters = it->getSearchParameters(); - - search_parameters.setMetaValue("Percolator:cpos", getDoubleOption_("cpos")); - search_parameters.setMetaValue("Percolator:cneg", getDoubleOption_("cneg")); + + search_parameters.setMetaValue("Percolator:peptide-level-fdrs", peptide_level_fdrs); + search_parameters.setMetaValue("Percolator:protein-level-fdrs", protein_level_fdrs); + search_parameters.setMetaValue("Percolator:generic-feature-set", getFlag_("generic-feature-set")); search_parameters.setMetaValue("Percolator:testFDR", getDoubleOption_("testFDR")); search_parameters.setMetaValue("Percolator:trainFDR", getDoubleOption_("trainFDR")); + search_parameters.setMetaValue("Percolator:maxiter", getIntOption_("maxiter")); + search_parameters.setMetaValue("Percolator:subset-max-train", 
getIntOption_("subset-max-train")); search_parameters.setMetaValue("Percolator:quick-validation", getFlag_("quick-validation")); search_parameters.setMetaValue("Percolator:weights", getStringOption_("weights")); search_parameters.setMetaValue("Percolator:init-weights", getStringOption_("init-weights")); search_parameters.setMetaValue("Percolator:default-direction", getStringOption_("default-direction")); + search_parameters.setMetaValue("Percolator:cpos", getDoubleOption_("cpos")); + search_parameters.setMetaValue("Percolator:cneg", getDoubleOption_("cneg")); search_parameters.setMetaValue("Percolator:unitnorm", getFlag_("unitnorm")); search_parameters.setMetaValue("Percolator:override", getFlag_("override")); search_parameters.setMetaValue("Percolator:seed", getIntOption_("seed")); search_parameters.setMetaValue("Percolator:doc", getIntOption_("doc")); search_parameters.setMetaValue("Percolator:klammer", getFlag_("klammer")); - search_parameters.setMetaValue("Percolator:picked-protein", getFlag_("picked-protein")); search_parameters.setMetaValue("Percolator:fasta", getStringOption_("fasta")); search_parameters.setMetaValue("Percolator:decoy-pattern", getStringOption_("decoy-pattern")); search_parameters.setMetaValue("Percolator:post-processing-tdc", getFlag_("post-processing-tdc")); diff --git a/src/utils/PSMFeatureExtractor.cpp b/src/utils/PSMFeatureExtractor.cpp index 4299ab9827d..adb3fe8f99d 100644 --- a/src/utils/PSMFeatureExtractor.cpp +++ b/src/utils/PSMFeatureExtractor.cpp @@ -106,9 +106,10 @@ class PSMFeatureExtractor : registerOutputFile_("out", "", "", "Output file in idXML format", false); registerOutputFile_("mzid_out", "", "", "Output file in mzid format", false); registerFlag_("multiple_search_engines", "Combine PSMs from different search engines by merging on scan level."); - - registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); - registerFlag_("override_db_check", "Manual override to check if same settings 
for multiple search engines were applied.", true); + + // TODO: add this MHC feature back in with TopPerc::hasMHCEnd_() + //registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); + registerFlag_("skip_db_check", "Manual override to skip the check if same settings for multiple search engines were applied.", true); registerFlag_("concat", "Naive merging of PSMs from different search engines: concatenate multiple search results instead of merging on scan level. Only valid together wtih -multiple_search_engines flag.", true); } @@ -119,13 +120,20 @@ class PSMFeatureExtractor : //------------------------------------------------------------- vector all_peptide_ids; vector all_protein_ids; - + //------------------------------------------------------------- // parsing parameters //------------------------------------------------------------- const StringList in_list = getStringList_("in"); + bool multiple_search_engines = getFlag_("multiple_search_engines"); LOG_DEBUG << "Input file (of target?): " << ListUtils::concatenate(in_list, ",") << endl; - + if (in_list.size() > 1 && !multiple_search_engines) + { + writeLog_("Fatal error: multiple input files given for -in, but -multiple_search_engines flag not specified. 
If the same search engine was used, feed the input files into PSMFeatureExtractor one by one."); + printUsage_(); + return ILLEGAL_PARAMETERS; + } + const String mzid_out(getStringOption_("mzid_out")); const String out(getStringOption_("out")); if (mzid_out.empty() && out.empty()) @@ -138,8 +146,7 @@ class PSMFeatureExtractor : //------------------------------------------------------------- // read input //------------------------------------------------------------- - bool multiple_search_engines = getFlag_("multiple-search-engines"); - bool override_db_check = getFlag_("override_db_check"); + bool skip_db_check = getFlag_("skip_db_check"); bool concatenate = getFlag_("concat"); StringList search_engines_used; for (StringList::const_iterator fit = in_list.begin(); fit != in_list.end(); ++fit) @@ -149,6 +156,7 @@ class PSMFeatureExtractor : String in = *fit; FileHandler fh; FileTypes::Type in_type = fh.getType(in); + LOG_INFO << "Loading input file: " << in << endl; if (in_type == FileTypes::IDXML) { IdXMLFile().load(in, protein_ids, peptide_ids); @@ -158,67 +166,57 @@ class PSMFeatureExtractor : LOG_WARN << "Converting from mzid: possible loss of information depending on target format." << endl; MzIdentMLFile().load(in, protein_ids, peptide_ids); } - //else catched by TOPPBase:registerInput being mandatory mzid or idxml - - //paranoia check if this comes from the same search engine! 
(only in the first proteinidentification of the first proteinidentifications vector vector) + //else caught by TOPPBase:registerInput being mandatory mzid or idxml + + if (multiple_search_engines && !skip_db_check && !all_protein_ids.empty()) { ProteinIdentification::SearchParameters all_search_parameters = all_protein_ids.front().getSearchParameters(); ProteinIdentification::SearchParameters search_parameters = protein_ids.front().getSearchParameters(); - if (!multiple_search_engines && protein_ids.front().getSearchEngine() != all_protein_ids.front().getSearchEngine()) - { - writeLog_("Input files are not all from the same search engine, set -multiple_search_engines to allow this. Aborting!"); - return INCOMPATIBLE_INPUT_DATA; - } - - if (!override_db_check && search_parameters.db != all_search_parameters.db) + if (search_parameters.db != all_search_parameters.db) { - writeLog_("Input files are not searched with the same protein database, set -override_db_check flag to allow this. Aborting!"); + writeLog_("Input files are not searched with the same protein database, set -skip_db_check flag to ignore this. 
Aborting!"); return INCOMPATIBLE_INPUT_DATA; } - - if (protein_ids.front().getScoreType() != all_protein_ids.front().getScoreType() ) - { - LOG_WARN << "Warning: differing ScoreType between input files" << endl; - } - if (search_parameters.digestion_enzyme != all_search_parameters.digestion_enzyme ) - { - LOG_WARN << "Warning: differing DigestionEnzyme between input files" << endl; - } - if (search_parameters.variable_modifications != all_search_parameters.variable_modifications ) - { - LOG_WARN << "Warning: differing VarMods between input files" << endl; - } - if (search_parameters.fixed_modifications != all_search_parameters.fixed_modifications ) - { - LOG_WARN << "Warning: differing FixMods between input files" << endl; - } - if (search_parameters.charges != all_search_parameters.charges ) - { - LOG_WARN << "Warning: differing SearchCharges between input files" << endl; - } - if (search_parameters.fragment_mass_tolerance != all_search_parameters.fragment_mass_tolerance ) - { - LOG_WARN << "Warning: differing FragTol between input files" << endl; - } - if (search_parameters.precursor_tolerance != all_search_parameters.precursor_tolerance ) - { - LOG_WARN << "Warning: differing PrecTol between input files" << endl; - } } if (!multiple_search_engines) { all_peptide_ids.insert(all_peptide_ids.end(), peptide_ids.begin(), peptide_ids.end()); } - else if (concatenate) - { - TopPerc::concatMULTISEids(all_protein_ids, all_peptide_ids, protein_ids, peptide_ids, search_engines_used); - } else { - // will collapse the list (reference) - TopPerc::mergeMULTISEids(all_protein_ids, all_peptide_ids, protein_ids, peptide_ids, search_engines_used); + String search_engine = protein_ids.front().getSearchEngine(); + if (!ListUtils::contains(search_engines_used, search_engine)) + { + search_engines_used.push_back(search_engine); + } + + if (concatenate) + { + // will concatenate the list + TopPerc::concatMULTISEPeptideIds(all_peptide_ids, peptide_ids, search_engine); + } + else + { + 
// will collapse the list (reference) + TopPerc::mergeMULTISEPeptideIds(all_peptide_ids, peptide_ids); + } } + TopPerc::mergeMULTISEProteinIds(all_protein_ids, protein_ids); + } + + if (all_peptide_ids.empty()) + { + writeLog_("No peptide hits found in input file. Aborting!"); + printUsage_(); + return INPUT_FILE_EMPTY; + } + + if (all_protein_ids.empty()) + { + writeLog_("No protein hits found in input file. Aborting!"); + printUsage_(); + return INPUT_FILE_EMPTY; } //------------------------------------------------------------- @@ -253,6 +251,13 @@ class PSMFeatureExtractor : return INCOMPATIBLE_INPUT_DATA; } + String run_identifier = all_protein_ids.front().getIdentifier(); + for (vector::iterator it = all_peptide_ids.begin(); it != all_peptide_ids.end(); ++it) + { + it->setIdentifier(run_identifier); + } + + // TODO: There should only be 1 ProteinIdentification element in this vector, no need for a for loop for (vector::iterator it = all_protein_ids.begin(); it != all_protein_ids.end(); ++it) { ProteinIdentification::SearchParameters search_parameters = it->getSearchParameters(); From b05e0564ec67f03aec9ec6fa7d9ca5f106b6b043 Mon Sep 17 00:00:00 2001 From: Matthew The Date: Thu, 21 Jul 2016 13:35:56 +0200 Subject: [PATCH 24/41] Fixed some issues for xtandem and multi search engine merge --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 1 + src/openms/source/ANALYSIS/ID/TopPerc.cpp | 70 ++++++++++++++----- src/topp/PercolatorAdapter.cpp | 68 ++++++++++++------ src/utils/PSMFeatureExtractor.cpp | 2 +- 4 files changed, 100 insertions(+), 41 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 45e90d03ddb..25f92d51535 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -86,6 +86,7 @@ namespace OpenMS static double rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons); static void assignDeltaScore_(std::vector& 
hits, String score_ref, String output_ref); static bool hasMHCEnd_(String peptide); + static String getScanMergeKey_(std::vector::iterator it, std::vector::iterator start); }; diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 36b7130fd75..6e8c3fe57ff 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -331,7 +331,7 @@ namespace OpenMS PeptideIdentification ins = *pit; ins.setScoreType("multiple"); ins.setIdentifier("TopPerc_multiple_SE_input"); - String spectrum_reference = ins.getMetaValue("spectrum_reference"); + String spectrum_reference = getScanMergeKey_(pit, all_peptide_ids.begin()); unified[spectrum_reference] = ins; } @@ -345,7 +345,7 @@ namespace OpenMS } ins.setScoreType("multiple"); ins.setIdentifier("TopPerc_multiple_SE_input"); - String spectrum_reference = ins.getMetaValue("spectrum_reference"); + String spectrum_reference = getScanMergeKey_(pit, new_peptide_ids.begin()); //merge in unified map if (unified.find(spectrum_reference) == unified.end()) { @@ -410,6 +410,8 @@ namespace OpenMS { LOG_DEBUG << "merging search parameters" << endl; //care for search parameters!! 
+ + String SE = new_protein_ids.front().getSearchEngine(); if (all_protein_ids.empty()) { all_protein_ids.push_back(ProteinIdentification()); @@ -418,8 +420,14 @@ namespace OpenMS String identifier = "TopPerc_" + date_string; all_protein_ids.front().setDateTime(now); all_protein_ids.front().setIdentifier(identifier); + all_protein_ids.front().setSearchEngine(SE); + LOG_DEBUG << "Setting search engine to " << SE << endl; all_protein_ids.front().setSearchParameters(new_protein_ids.front().getSearchParameters()); } + else if (all_protein_ids.front().getSearchEngine() != SE) + { + all_protein_ids.front().setSearchEngine("multiple"); + } std::vector& all_protein_hits = all_protein_ids.front().getHits(); std::vector& new_protein_hits = new_protein_ids.front().getHits(); @@ -442,8 +450,7 @@ namespace OpenMS all_protein_hits.swap(tmp_protein_hits); } LOG_DEBUG << "Done with next ProteinHits." << endl; - - String SE = new_protein_ids.front().getSearchEngine(); + StringList keys; all_protein_ids.front().getSearchParameters().getKeys(keys); if (!ListUtils::contains(keys, "SE:" + SE)) @@ -467,15 +474,6 @@ namespace OpenMS all_sp.setMetaValue(SE+":precursor_mass_tolerance_ppm",sp.precursor_mass_tolerance_ppm); all_sp.setMetaValue(SE+":digestion_enzyme",sp.digestion_enzyme.getName()); - if (!all_protein_ids.front().getSearchEngine().empty()) - { - all_protein_ids.front().setSearchEngine("multiple"); - } - else - { - all_protein_ids.front().setSearchEngine(SE); - LOG_DEBUG << "Setting search engine to " << SE << endl; - } LOG_DEBUG << "Done with next Parameters." 
<< endl; all_protein_ids.front().setSearchParameters(all_sp); } @@ -542,11 +540,12 @@ namespace OpenMS } if (ListUtils::contains(search_engines_used, "XTandem")) { - feature_set.push_back("XTandem_score"); + //TODO: create XTandem score + //feature_set.push_back("XTandem_score"); feature_set.push_back("E-Value"); } - feature_set.push_back("MULTI:ionFrac"); - feature_set.push_back("MULTI:numHits"); + //feature_set.push_back("MULTI:ionFrac"); + //feature_set.push_back("MULTI:numHits"); // this is not informative if we only keep PSMs with hits for all search engines LOG_INFO << "Using " << ListUtils::concatenate(search_engines_used, ", ") << " as source for search engine specific features." << endl; @@ -557,8 +556,8 @@ namespace OpenMS it->assignRanks(); for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) { - double ion_frac = hit->getMetaValue("matched_intensity").toString().toDouble() / hit->getMetaValue("sum_intensity").toString().toDouble(); // also consider "matched_ion_number"/"peak_number" - hit->setMetaValue("MULTI:ionFrac", ion_frac); + //double ion_frac = hit->getMetaValue("matched_intensity").toString().toDouble() / hit->getMetaValue("sum_intensity").toString().toDouble(); // also consider "matched_ion_number"/"peak_number" + //hit->setMetaValue("MULTI:ionFrac", ion_frac); int num_hits = hit->getScore(); hit->setMetaValue("MULTI:numHits", num_hits); @@ -608,5 +607,40 @@ namespace OpenMS } return suf; } + + // TODO: check if this is consistent for all search engines. MSGF+ and X!Tandem have been checked. + String TopPerc::getScanMergeKey_(vector::iterator it, vector::iterator start) + { + // MSGF+ uses this field, is empty if not specified + String scan_identifier = it->getMetaValue("spectrum_reference"); + if (scan_identifier.empty()) + { + // XTandem uses this (integer) field + // these ids are 1-based in contrast to the index which is 0-based, so subtract 1. 
+ if (it->metaValueExists("spectrum_id") && !it->getMetaValue("spectrum_id").toString().empty()) + { + scan_identifier = "index=" + String(it->getMetaValue("spectrum_id").toString().toInt() - 1); + } + else + { + scan_identifier = "index=" + String(it - start + 1); + LOG_WARN << "no known spectrum identifiers, using index [1,n] - use at own risk." << endl; + } + } + + Int scan_number = 0; + StringList fields = ListUtils::create(scan_identifier); + for (StringList::const_iterator it = fields.begin(); it != fields.end(); ++it) + { + // if scan number is not available, use the scan index + Size idx = 0; + if ((idx = it->find("index=")) != string::npos) + { + scan_number = it->substr(idx + 6).toInt(); + } + } + return String(scan_number); + } + } diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp index bea98600119..980384864e6 100644 --- a/src/topp/PercolatorAdapter.cpp +++ b/src/topp/PercolatorAdapter.cpp @@ -241,35 +241,46 @@ class PercolatorAdapter : registerFlag_("post-processing-tdc", "Use target-decoy competition to assign q-values and PEPs.", is_advanced_option); } - // TODO: add file specific scan identifiers String getScanIdentifier_(vector::iterator it, vector::iterator start) { + // MSGF+ uses this field, is empty if not specified String scan_identifier = it->getMetaValue("spectrum_reference"); if (scan_identifier.empty()) { - scan_identifier = String(it->getMetaValue("spectrum_id")); - if (scan_identifier.empty()) + // XTandem uses this (integer) field + // these ids are 1-based in contrast to the index which is 0-based. This might be problematic to use for merging + if (it->metaValueExists("spectrum_id") && !it->getMetaValue("spectrum_id").toString().empty()) { - scan_identifier = String(it - start + 1); + scan_identifier = "scan=" + it->getMetaValue("spectrum_id").toString(); + } + else + { + scan_identifier = "index=" + String(it - start + 1); LOG_WARN << "no known spectrum identifiers, using index [1,n] - use at own risk." 
<< endl; } } return scan_identifier.removeWhitespaces(); } - // TODO: add file specific scan numbers Int getScanNumber_(String scan_identifier) { - Size idx = 0; - if ((idx = scan_identifier.find("index=")) != string::npos) - { - scan_identifier = scan_identifier.substr(idx + 6); - } - else if ((idx = scan_identifier.find("scan=")) != string::npos) + Int scan_number = 0; + StringList fields = ListUtils::create(scan_identifier); + for (StringList::const_iterator it = fields.begin(); it != fields.end(); ++it) { - scan_identifier = scan_identifier.substr(idx + 5); + // if scan number is not available, use the scan index + Size idx = 0; + if ((idx = it->find("scan=")) != string::npos) + { + scan_number = it->substr(idx + 5).toInt(); + break; + } + else if ((idx = it->find("index=")) != string::npos) + { + scan_number = it->substr(idx + 6).toInt(); + } } - return scan_identifier.toInt(); + return scan_number; } // Function adapted from Enzyme.h in Percolator converter @@ -355,6 +366,7 @@ class PercolatorAdapter : { String scan_identifier = getScanIdentifier_(it, peptide_ids.begin()); Int scan_number = getScanNumber_(scan_identifier); + double exp_mass = it->getMZ(); for (vector::const_iterator jt = it->getHits().begin(); jt != it->getHits().end(); ++jt) { @@ -362,8 +374,10 @@ class PercolatorAdapter : hit.setMetaValue("SpecId", scan_identifier); hit.setMetaValue("ScanNr", scan_number); + if (!hit.metaValueExists("target_decoy") || hit.getMetaValue("target_decoy").toString().empty()) continue; + int label = 1; - if (hit.metaValueExists("target_decoy") && String(hit.getMetaValue("target_decoy")).hasSubstring("decoy")) + if (String(hit.getMetaValue("target_decoy")).hasSubstring("decoy")) { label = -1; } @@ -750,6 +764,7 @@ class PercolatorAdapter : String pout_target_file_proteins(temp_directory_body + txt_designator + "_target_pout_proteins.tab"); String pout_decoy_file_proteins(temp_directory_body + txt_designator + "_decoy_pout_proteins.tab"); + LOG_DEBUG << "Writing 
percolator input file." << endl; TextFile txt; txt.addLine(ListUtils::concatenate(feature_set, '\t')); preparePin_(all_peptide_ids, feature_set, enz_str, txt, min_charge, max_charge); @@ -770,13 +785,17 @@ class PercolatorAdapter : arguments << "-m" << pout_target_file.toQString(); arguments << "-M" << pout_decoy_file.toQString(); - String fasta_file = getStringOption_("fasta"); - if (fasta_file.empty()) fasta_file = "auto"; if (protein_level_fdrs) { - arguments << "-f" << fasta_file.toQString(); arguments << "-l" << pout_target_file_proteins.toQString(); arguments << "-L" << pout_decoy_file_proteins.toQString(); + + String fasta_file = getStringOption_("fasta"); + if (fasta_file.empty()) fasta_file = "auto"; + arguments << "-f" << fasta_file.toQString(); + + String decoy_pattern = getStringOption_("decoy-pattern"); + if (decoy_pattern != "random") arguments << "-P" << decoy_pattern.toQString(); } double cpos = getDoubleOption_("cpos"); @@ -816,8 +835,6 @@ class PercolatorAdapter : Int description_of_correct = getIntOption_("doc"); if (description_of_correct != 0) arguments << "-D" << String(description_of_correct).toQString(); - String decoy_pattern = getStringOption_("decoy-pattern"); - if (decoy_pattern != "random") arguments << "-P" << decoy_pattern.toQString(); arguments << pin_file.toQString(); } writeLog_("Prepared percolator input."); @@ -863,8 +880,11 @@ class PercolatorAdapter : } map protein_map; - readProteinPoutAsMap_(pout_target_file_proteins, protein_map); - readProteinPoutAsMap_(pout_decoy_file_proteins, protein_map); + if (protein_level_fdrs) + { + readProteinPoutAsMap_(pout_target_file_proteins, protein_map); + readProteinPoutAsMap_(pout_decoy_file_proteins, protein_map); + } // As the percolator output file is not needed anymore, the temporary directory is going to be deleted if (this->debug_level_ < 5) @@ -904,6 +924,10 @@ class PercolatorAdapter : hit->setScore(pr->second.qvalue); ++cnt; } + else + { + hit->setScore(1.0); // set q-value to 1.0 
if hit not found in results + } } } //LOG_INFO << "No suitable PeptideIdentification for " << c_debug << " out of " << all_peptide_ids.size() << endl; @@ -927,7 +951,7 @@ class PercolatorAdapter : } else { - hit->setScore(1.0); + hit->setScore(1.0); // set q-value to 1.0 if hit not found in results } } it->setSearchEngine("Percolator"); diff --git a/src/utils/PSMFeatureExtractor.cpp b/src/utils/PSMFeatureExtractor.cpp index adb3fe8f99d..2e4b7d05cfb 100644 --- a/src/utils/PSMFeatureExtractor.cpp +++ b/src/utils/PSMFeatureExtractor.cpp @@ -174,7 +174,7 @@ class PSMFeatureExtractor : ProteinIdentification::SearchParameters search_parameters = protein_ids.front().getSearchParameters(); if (search_parameters.db != all_search_parameters.db) { - writeLog_("Input files are not searched with the same protein database, set -skip_db_check flag to ignore this. Aborting!"); + writeLog_("Input files are not searched with the same protein database, " + search_parameters.db + " vs. " + all_search_parameters.db + ". Set -skip_db_check flag to ignore this. 
Aborting!"); return INCOMPATIBLE_INPUT_DATA; } } From 346bbedd2750aaf189be5c0a8b8b76d122cfb895 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Thu, 24 Nov 2016 14:07:00 +0100 Subject: [PATCH 25/41] [FIX] fixed after merge to reflect renamed precursor_mass_tolerance --- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 2 +- src/topp/PercolatorAdapter.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 6e8c3fe57ff..b71adab544b 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -470,7 +470,7 @@ namespace OpenMS all_sp.setMetaValue(SE+":missed_cleavages",sp.missed_cleavages); all_sp.setMetaValue(SE+":fragment_mass_tolerance",sp.fragment_mass_tolerance); all_sp.setMetaValue(SE+":fragment_mass_tolerance_ppm",sp.fragment_mass_tolerance_ppm); - all_sp.setMetaValue(SE+":precursor_tolerance",sp.precursor_tolerance); + all_sp.setMetaValue(SE+":precursor_mass_tolerance",sp.precursor_mass_tolerance); all_sp.setMetaValue(SE+":precursor_mass_tolerance_ppm",sp.precursor_mass_tolerance_ppm); all_sp.setMetaValue(SE+":digestion_enzyme",sp.digestion_enzyme.getName()); diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp index 980384864e6..f52a7ee4b26 100644 --- a/src/topp/PercolatorAdapter.cpp +++ b/src/topp/PercolatorAdapter.cpp @@ -609,7 +609,7 @@ class PercolatorAdapter : { LOG_WARN << "Warning: differing FragTol between input files" << endl; } - if (search_parameters.precursor_tolerance != all_search_parameters.precursor_tolerance ) + if (search_parameters.precursor_mass_tolerance != all_search_parameters.precursor_mass_tolerance ) { LOG_WARN << "Warning: differing PrecTol between input files" << endl; } From d1c78781ec6a02d256ff0f8bc4249367fa313897 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Tue, 29 Nov 2016 10:10:53 +0100 Subject: [PATCH 26/41] [NOP] doc and code duplicate removal --- 
src/utils/PSMFeatureExtractor.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/utils/PSMFeatureExtractor.cpp b/src/utils/PSMFeatureExtractor.cpp index 2e4b7d05cfb..0147812c281 100644 --- a/src/utils/PSMFeatureExtractor.cpp +++ b/src/utils/PSMFeatureExtractor.cpp @@ -168,6 +168,7 @@ class PSMFeatureExtractor : } //else caught by TOPPBase:registerInput being mandatory mzid or idxml + // will check if all ProteinIdentifications have the same search db unless it is the first, in which case all_protein_ids is empty yet. if (multiple_search_engines && !skip_db_check && !all_protein_ids.empty()) { ProteinIdentification::SearchParameters all_search_parameters = all_protein_ids.front().getSearchParameters(); @@ -205,13 +206,6 @@ class PSMFeatureExtractor : TopPerc::mergeMULTISEProteinIds(all_protein_ids, protein_ids); } - if (all_peptide_ids.empty()) - { - writeLog_("No peptide hits found in input file. Aborting!"); - printUsage_(); - return INPUT_FILE_EMPTY; - } - if (all_protein_ids.empty()) { writeLog_("No protein hits found in input file. 
Aborting!"); From 376c21291b3ff6a4309c66bbae191653fba26d78 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Tue, 29 Nov 2016 10:16:47 +0100 Subject: [PATCH 27/41] [FIX] fixed faulty getScanMergeKey_ function, added debug information --- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 59 +++++++++++++++-------- 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index b71adab544b..ac1caeab7d5 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -334,26 +334,33 @@ namespace OpenMS String spectrum_reference = getScanMergeKey_(pit, all_peptide_ids.begin()); unified[spectrum_reference] = ins; } - + LOG_DEBUG << "filled spectrum map with previously observed PSM: " << unified.size() << endl; + + int nc = 0; + int mc = 0; + LOG_DEBUG << "About to merge in:" << new_peptide_ids.size() << "PSMs." << endl; for (vector::iterator pit = new_peptide_ids.begin(); pit != new_peptide_ids.end(); ++pit) { PeptideIdentification ins = *pit; //prepare for merge for (vector::iterator hit = ins.getHits().begin(); hit != ins.getHits().end(); ++hit) { - hit->setScore(1); + hit->setScore(1); // new 'multiple' score is just the number of times identified by different SE } ins.setScoreType("multiple"); ins.setIdentifier("TopPerc_multiple_SE_input"); String spectrum_reference = getScanMergeKey_(pit, new_peptide_ids.begin()); //merge in unified map + // insert newly identified spectra (PeptideIdentifications) or .. if (unified.find(spectrum_reference) == unified.end()) { unified[spectrum_reference] = ins; + ++nc; } + // .. add PSMs to already identified spectrum else { - //find corresponding hit + //find corresponding hit (i.e. 
sequences must match) for (vector::iterator hit = ins.getHits().begin(); hit != ins.getHits().end(); ++hit) { for (vector::iterator merger = unified[spectrum_reference].getHits().begin(); merger != unified[spectrum_reference].getHits().end(); ++merger) @@ -384,8 +391,9 @@ namespace OpenMS merger->setMetaValue(*kt, hit->getMetaValue(*kt)); } } - // adds up the number of hits, as the score of each separate hit is 1 + // adds up the number of hits, as the score of each separate (new) hit is 1 merger->setScore(merger->getScore() + hit->getScore()); + ++mc; break; } } @@ -402,7 +410,7 @@ namespace OpenMS swip.push_back(it->second); } all_peptide_ids.swap(swip); - LOG_DEBUG << "Now containing " << all_peptide_ids.size() << " spectra identifications."<< endl; + LOG_DEBUG << "Now containing " << all_peptide_ids.size() << " spectra identifications, new: " << nc << " merged in: " << mc << endl; } // references from PeptideHits to ProteinHits work with the protein accessions, so no need to update the PeptideHits @@ -477,15 +485,24 @@ namespace OpenMS LOG_DEBUG << "Done with next Parameters." << endl; all_protein_ids.front().setSearchParameters(all_sp); } - - StringList all_primary_ms_run_path = all_protein_ids.front().getPrimaryMSRunPath(); - StringList new_primary_ms_run_path = new_protein_ids.front().getPrimaryMSRunPath(); - all_primary_ms_run_path.insert(all_primary_ms_run_path.end(), new_primary_ms_run_path.begin(), new_primary_ms_run_path.end()); - all_protein_ids.front().setPrimaryMSRunPath(all_primary_ms_run_path); - LOG_DEBUG << "New primary run paths: " << ListUtils::concatenate(new_primary_ms_run_path,",") << endl; - LOG_DEBUG << "All primary run paths: " << ListUtils::concatenate(all_primary_ms_run_path,",") << endl; - - LOG_DEBUG << "All merging finished." << endl; + LOG_DEBUG << "Merging primaryMSRunPaths." 
<< endl; + try + { + StringList all_primary_ms_run_path = all_protein_ids.front().getPrimaryMSRunPath(); + StringList new_primary_ms_run_path = new_protein_ids.front().getPrimaryMSRunPath(); + + all_primary_ms_run_path.insert(all_primary_ms_run_path.end(), new_primary_ms_run_path.begin(), new_primary_ms_run_path.end()); + all_protein_ids.front().setPrimaryMSRunPath(all_primary_ms_run_path); + + LOG_DEBUG << "New primary run paths: " << ListUtils::concatenate(new_primary_ms_run_path,",") << endl; + LOG_DEBUG << "All primary run paths: " << ListUtils::concatenate(all_primary_ms_run_path,",") << endl; + } + catch (Exception::BaseException& e) + { + LOG_DEBUG << "faulty primary MS run path: " << endl; + Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, e.what(), ""); + } + LOG_DEBUG << "Merging for this file finished." << endl; } void TopPerc::concatMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids, String search_engine) @@ -628,18 +645,22 @@ namespace OpenMS } } - Int scan_number = 0; + Int scan = 0; StringList fields = ListUtils::create(scan_identifier); for (StringList::const_iterator it = fields.begin(); it != fields.end(); ++it) { - // if scan number is not available, use the scan index Size idx = 0; - if ((idx = it->find("index=")) != string::npos) + if ((idx = it->find("scan=")) != string::npos) + { + scan = it->substr(idx + 5).toInt(); + break; + } // only if scan number is not available, use the scan index + else if ((idx = it->find("index=")) != string::npos) { - scan_number = it->substr(idx + 6).toInt(); + scan = it->substr(idx + 6).toInt(); } } - return String(scan_number); + return String(scan); } From e1653064eff72fd378d670d0772df7fdcd910857 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Mon, 12 Dec 2016 14:21:12 +0100 Subject: [PATCH 28/41] [FIX] fixed refactoring introduced glitches --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 13 +- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 211 ++++++++++++++++-- 
src/topp/PercolatorAdapter.cpp | 10 +- src/utils/PSMFeatureExtractor.cpp | 19 +- 4 files changed, 227 insertions(+), 26 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 25f92d51535..a54a9e9b881 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -40,10 +40,15 @@ #include #include #include +#include +#include #include #include #include +#include + +#include namespace OpenMS { @@ -52,16 +57,18 @@ namespace OpenMS public: static void concatMULTISEPeptideIds(std::vector& all_peptide_ids, std::vector& new_peptide_ids, String search_engine); - static void mergeMULTISEPeptideIds(std::vector& all_peptide_ids, std::vector& new_peptide_ids); + static void mergeMULTISEPeptideIds(std::vector& all_peptide_ids, std::vector& new_peptide_ids, String search_engine); static void mergeMULTISEProteinIds(std::vector& all_protein_ids, std::vector& new_protein_ids); static void addMSGFFeatures(std::vector& peptide_ids, StringList& feature_set); static void addXTANDEMFeatures(std::vector& peptide_ids, StringList& feature_set); static void addCOMETFeatures(std::vector& peptide_ids, StringList& feature_set); static void addMASCOTFeatures(std::vector& peptide_ids, StringList& feature_set); - static void addMULTISEFeatures(std::vector& peptide_ids, StringList& search_engines_used, StringList& feature_set); + static void addMULTISEFeatures(std::vector& peptide_ids, StringList& search_engines_used, StringList& feature_set, bool complete_only = true, bool limits_imputation = false); static void addCONCATSEFeatures(std::vector& peptide_id_list, StringList& search_engines_used, StringList& feature_set); - + + static void checkExtraFeatures(const std::vector &psms, StringList& extra_features); + struct lq_ProteinHit { inline bool operator() (const ProteinHit& h1, const ProteinHit& h2) diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp 
b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index ac1caeab7d5..3ced541bcca 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -320,7 +320,7 @@ namespace OpenMS } } - void TopPerc::mergeMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids) + void TopPerc::mergeMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids, String search_engine) { LOG_DEBUG << "creating spectrum map" << endl; @@ -342,10 +342,47 @@ namespace OpenMS for (vector::iterator pit = new_peptide_ids.begin(); pit != new_peptide_ids.end(); ++pit) { PeptideIdentification ins = *pit; + string st = pit->getScoreType(); //prepare for merge for (vector::iterator hit = ins.getHits().begin(); hit != ins.getHits().end(); ++hit) { + // keep the hit score as meta value + if (st == "MS-GF:RawScore") + st = "MS:1002049"; + else if (st == "XTandem") + st = "MS:1001331"; + else if (st == "Mascot") + st = "MS:1001171"; + else if ((st == "expect" && search_engine == "Comet" )|| st == "Comet") + st = "MS:1002257"; + + if (!hit->metaValueExists(st)) + hit->setMetaValue(st, hit->getScore()); + hit->setScore(1); // new 'multiple' score is just the number of times identified by different SE + + // rename ambiguous meta value names to PSI cv term ids + if (search_engine == "MS-GF+") // MS-GF should have all values as PSI cv terms available anyway + { + if (hit->metaValueExists("EValue")) + hit->setMetaValue("MS:1002053", hit->getMetaValue("EValue")); + } + if (search_engine == "Mascot") + { + if (hit->metaValueExists("EValue")) + hit->setMetaValue("MS:1001172", hit->getMetaValue("EValue")); + } + if (search_engine == "Comet") + { + if (hit->metaValueExists("xcorr")) + hit->setMetaValue("MS:1002252", hit->getMetaValue("xcorr")); + } + if (search_engine == "XTandem") + { + if (hit->metaValueExists("E-Value")) + hit->setMetaValue("MS:1001330", hit->getMetaValue("E-Value")); + } + } ins.setScoreType("multiple"); 
ins.setIdentifier("TopPerc_multiple_SE_input"); @@ -391,6 +428,7 @@ namespace OpenMS merger->setMetaValue(*kt, hit->getMetaValue(*kt)); } } + // adds up the number of hits, as the score of each separate (new) hit is 1 merger->setScore(merger->getScore() + hit->getScore()); ++mc; @@ -417,7 +455,6 @@ namespace OpenMS void TopPerc::mergeMULTISEProteinIds(vector& all_protein_ids, vector& new_protein_ids) { LOG_DEBUG << "merging search parameters" << endl; - //care for search parameters!! String SE = new_protein_ids.front().getSearchEngine(); if (all_protein_ids.empty()) @@ -538,28 +575,38 @@ namespace OpenMS all_peptide_ids.insert(all_peptide_ids.end(), new_peptide_ids.begin(), new_peptide_ids.end()); } - void TopPerc::addMULTISEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set) + void TopPerc::addMULTISEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set, bool complete_only, bool limits_imputation) { + map > extremals; // will have as keys the below SE cv terms + vector max_better, min_better; + // This is the minimum set for each SE that should be available in all openms id files in one way or another if (ListUtils::contains(search_engines_used, "MS-GF+")) { feature_set.push_back("MS:1002049"); // rawscore feature_set.push_back("MS:1002053"); // evalue + max_better.push_back("MS:1002049"); // higher is better - start high, get min + min_better.push_back("MS:1002053"); // lower is better - start low, get max } if (ListUtils::contains(search_engines_used, "Mascot")) { - feature_set.push_back("MS:1001171"); - feature_set.push_back("EValue"); + feature_set.push_back("MS:1001171"); // score aka Mascot + feature_set.push_back("MS:1001172"); // evalue aka EValue + max_better.push_back("MS:1001171"); // higher is better - start high, get min + min_better.push_back("MS:1001172"); // lower is better - start low, get max } if (ListUtils::contains(search_engines_used, "Comet")) { - 
feature_set.push_back("MS:1002252"); //xcorr - feature_set.push_back("MS:1002257"); //evalue + feature_set.push_back("MS:1002252"); // xcorr + feature_set.push_back("MS:1002257"); // evalue + max_better.push_back("MS:1002252"); // higher is better - start high, get min + min_better.push_back("MS:1002257"); // lower is better - start low, get max } if (ListUtils::contains(search_engines_used, "XTandem")) { - //TODO: create XTandem score - //feature_set.push_back("XTandem_score"); - feature_set.push_back("E-Value"); + feature_set.push_back("MS:1001331"); // hyperscore aka XTandem + feature_set.push_back("MS:1001330"); // evalue aka E-Value + max_better.push_back("MS:1001331"); // higher is better - start high, get min + min_better.push_back("MS:1001330"); // lower is better - start low, get max } //feature_set.push_back("MULTI:ionFrac"); //feature_set.push_back("MULTI:numHits"); // this is not informative if we only keep PSMs with hits for all search engines @@ -567,20 +614,154 @@ namespace OpenMS LOG_INFO << "Using " << ListUtils::concatenate(search_engines_used, ", ") << " as source for search engine specific features." 
<< endl; // get all the feature values - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + if (!complete_only) { - it->sort(); - it->assignRanks(); - for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) + { + for (vector::iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) + { + for (StringList::iterator feats = feature_set.begin(); feats != feature_set.end(); ++feats) + { + if (hit->metaValueExists(*feats)) + { + // TODO raise issue: MS-GF raw score values are sometimes registered as string DataValues and henceforth casted defectively + if (hit->getMetaValue(*feats).valueType() == DataValue::STRING_VALUE) + { + string recast = hit->getMetaValue(*feats); + double d = boost::lexical_cast(recast); + LOG_DEBUG << "recast: " + << recast << " " + << double(hit->getMetaValue(*feats)) << "* "; + hit->setMetaValue(*feats,d); + LOG_DEBUG << hit->getMetaValue(*feats).valueType() << " " + << hit->getMetaValue(*feats) + << endl; + } + extremals[*feats].push_back(hit->getMetaValue(*feats)); + } + } + } + } + // TODO : add optional manual extremal values settings for 'data imputation' instead of min/max or numeric_limits value + for (vector::iterator maxbt = max_better.begin(); maxbt != max_better.end(); ++maxbt) + { + map >::iterator fi = extremals.find(*maxbt); + if (fi != extremals.end()) + { + vector::iterator mymax = min_element(fi->second.begin(), fi->second.end()); + iter_swap(fi->second.begin(), mymax); + if (limits_imputation) + { + fi->second.front() = -std::numeric_limits::max(); + } + } + } + for (vector::iterator minbt = min_better.begin(); minbt != min_better.end(); ++minbt) + { + map >::iterator fi = extremals.find(*minbt); + if (fi != extremals.end()) + { + vector::iterator mymin = max_element(fi->second.begin(), fi->second.end()); + iter_swap(fi->second.begin(), mymin); + if (limits_imputation) + { 
+ fi->second.front() = std::numeric_limits::max(); + } + } + } + } + + size_t sum_removed = 0; + size_t imputed_values = 0; + size_t observed_values = 0; + size_t complete_spectra = 0; + size_t incomplete_spectra = 0; + + LOG_DEBUG << "Looking for minimum feature set:" << ListUtils::concatenate(feature_set, ", ") << "." << endl; + + for (vector::iterator pi = peptide_ids.begin(); pi != peptide_ids.end(); ++pi) + { + pi->sort(); + pi->assignRanks(); + vector::iterator> incompletes; + + size_t imputed_back = imputed_values; + for (vector::iterator hit = pi->getHits().begin(); hit != pi->getHits().end(); ++hit) { //double ion_frac = hit->getMetaValue("matched_intensity").toString().toDouble() / hit->getMetaValue("sum_intensity").toString().toDouble(); // also consider "matched_ion_number"/"peak_number" //hit->setMetaValue("MULTI:ionFrac", ion_frac); + for (StringList::iterator feats = feature_set.begin(); feats != feature_set.end(); ++feats) + { + if (complete_only && !hit->metaValueExists(*feats)) + { + incompletes.push_back(hit); // mark for removal + break; + } + else if (!hit->metaValueExists(*feats)) + { + hit->setMetaValue(*feats, extremals[*feats].front()); // imputation + ++imputed_values; + } + else + { + ++observed_values; + } + } int num_hits = hit->getScore(); hit->setMetaValue("MULTI:numHits", num_hits); } + if (complete_only) + { + // remove incompletes + for (vector::iterator>::reverse_iterator rit = incompletes.rbegin(); rit != incompletes.rend(); ++rit) + { + pi->getHits().erase(*rit); + } + sum_removed += incompletes.size(); + } + if (incompletes.size() > 0 || imputed_back < imputed_values) + ++incomplete_spectra; + else + ++complete_spectra; + } + if (sum_removed > 0) + { + LOG_WARN << "Removed " << sum_removed << " incomplete cases of PSMs." << endl; + } + if (imputed_values > 0) + { + LOG_WARN << "Imputed " << imputed_values << " of " << observed_values+imputed_values + << " missing values. 
(" + << imputed_values*100.0/(imputed_values+observed_values) + << "%)" << endl; + LOG_WARN << "Affected " << incomplete_spectra << " of " << incomplete_spectra+complete_spectra + << " spectra. (" + << incomplete_spectra*100.0/(incomplete_spectra+complete_spectra) + << "%)" << endl; } } + + void TopPerc::checkExtraFeatures(const vector& psms, StringList& extra_features) + { + set unavail; + for (vector::const_iterator hit = psms.begin(); hit != psms.end(); ++hit) + { + for (StringList::iterator ef = extra_features.begin(); ef != extra_features.end(); ++ef) + { + if (!hit->metaValueExists(*ef)) + { + unavail.insert(ef); + } + } + } + for (set::reverse_iterator rit = unavail.rbegin(); rit != unavail.rend(); ++rit) + { + LOG_WARN << "A extra_feature requested (" << *(*rit) << ") was not available - removed." << endl; + extra_features.erase(*rit); + } + } + // Function adapted from MsgfplusReader in Percolator converter double TopPerc::rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons) @@ -625,7 +806,7 @@ namespace OpenMS return suf; } - // TODO: check if this is consistent for all search engines. MSGF+ and X!Tandem have been checked. 
+ // TODO: this is code redundancy to PercolatorAdapter String TopPerc::getScanMergeKey_(vector::iterator it, vector::iterator start) { // MSGF+ uses this field, is empty if not specified diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp index f52a7ee4b26..d21e13ae37f 100644 --- a/src/topp/PercolatorAdapter.cpp +++ b/src/topp/PercolatorAdapter.cpp @@ -52,6 +52,9 @@ #include #include +#include +#include + using namespace OpenMS; using namespace std; @@ -241,6 +244,7 @@ class PercolatorAdapter : registerFlag_("post-processing-tdc", "Use target-decoy competition to assign q-values and PEPs.", is_advanced_option); } + // TODO replace with TopPerc::getScanMergeKey String getScanIdentifier_(vector::iterator it, vector::iterator start) { // MSGF+ uses this field, is empty if not specified @@ -262,6 +266,7 @@ class PercolatorAdapter : return scan_identifier.removeWhitespaces(); } + // TODO replace with TopPerc::getScanMergeKey Int getScanNumber_(String scan_identifier) { Int scan_number = 0; @@ -524,7 +529,6 @@ class PercolatorAdapter : } for (vector::iterator pht = pit->getHits().begin(); pht != pit->getHits().end(); ++pht) { - // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! if (!pht->metaValueExists("target_decoy")) { if (isDecoy) @@ -550,6 +554,8 @@ class PercolatorAdapter : { min_charge = pht->getCharge(); } + + // TODO: set min/max scores? } } @@ -639,7 +645,7 @@ class PercolatorAdapter : const String percolator_executable(getStringOption_("percolator_executable")); writeDebug_(String("Path to the percolator: ") + percolator_executable, 2); - if (percolator_executable.empty()) //TODO? - TOPPBase::findExecutable after registerInputFile_("percolator_executable"... ??? + if (percolator_executable.empty()) //TODO? - TOPPBase::findExecutable after registerInputFile_("percolator_executable"... ??? { writeLog_("No percolator executable specified. 
Aborting!"); printUsage_(); diff --git a/src/utils/PSMFeatureExtractor.cpp b/src/utils/PSMFeatureExtractor.cpp index 0147812c281..60697a7ecaa 100644 --- a/src/utils/PSMFeatureExtractor.cpp +++ b/src/utils/PSMFeatureExtractor.cpp @@ -111,6 +111,10 @@ class PSMFeatureExtractor : //registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); registerFlag_("skip_db_check", "Manual override to skip the check if same settings for multiple search engines were applied.", true); registerFlag_("concat", "Naive merging of PSMs from different search engines: concatenate multiple search results instead of merging on scan level. Only valid together wtih -multiple_search_engines flag.", true); + registerStringList_("extra", "", vector(), "List of the MetaData parameters to be included in a feature set for precolator.", false, false); + // setValidStrings_("extra", ?); + registerFlag_("impute", "Will instead of discarding all PSM not unanimously detected by all SE, impute missing values by their respective scores min/max observed.", true); + registerFlag_("limit_imputation", "Will impute missing scores with the worst numerical limit (instead of min/max observed) of the respective score.", true); } ExitCodes main_(int, const char**) @@ -200,7 +204,7 @@ class PSMFeatureExtractor : else { // will collapse the list (reference) - TopPerc::mergeMULTISEPeptideIds(all_peptide_ids, peptide_ids); + TopPerc::mergeMULTISEPeptideIds(all_peptide_ids, peptide_ids, search_engine); } } TopPerc::mergeMULTISEProteinIds(all_protein_ids, protein_ids); @@ -220,9 +224,9 @@ class PSMFeatureExtractor : if (multiple_search_engines) search_engine = "multiple"; LOG_DEBUG << "Registered search engine: " << search_engine << endl; - TextFile txt; - + StringList extra_features = getStringList_("extra"); StringList feature_set; + if (search_engine == "multiple") { if (getFlag_("concat")) @@ -231,10 +235,11 @@ class PSMFeatureExtractor : } else { - 
TopPerc::addMULTISEFeatures(all_peptide_ids, search_engines_used, feature_set); + bool impute = getFlag_("impute"); + bool limits = getFlag_("limit_imputation"); + TopPerc::addMULTISEFeatures(all_peptide_ids, search_engines_used, feature_set, !impute, limits); } } - //TODO introduce custom feature selection from TopPerc::prepareCUSTOMpin to parameters else if (search_engine == "MS-GF+") TopPerc::addMSGFFeatures(all_peptide_ids, feature_set); else if (search_engine == "Mascot") TopPerc::addMASCOTFeatures(all_peptide_ids, feature_set); else if (search_engine == "XTandem") TopPerc::addXTANDEMFeatures(all_peptide_ids, feature_set); @@ -244,11 +249,12 @@ class PSMFeatureExtractor : writeLog_("No known input to create PSM features from. Aborting"); return INCOMPATIBLE_INPUT_DATA; } - + String run_identifier = all_protein_ids.front().getIdentifier(); for (vector::iterator it = all_peptide_ids.begin(); it != all_peptide_ids.end(); ++it) { it->setIdentifier(run_identifier); + TopPerc::checkExtraFeatures(it->getHits(), extra_features); // will remove inconsistently available features } // TODO: There should only be 1 ProteinIdentification element in this vector, no need for a for loop @@ -257,6 +263,7 @@ class PSMFeatureExtractor : ProteinIdentification::SearchParameters search_parameters = it->getSearchParameters(); search_parameters.setMetaValue("feature_extractor", "TOPP_PSMFeatureExtractor"); + feature_set.insert(feature_set.end(), extra_features.begin(), extra_features.end()); search_parameters.setMetaValue("extra_features", ListUtils::concatenate(feature_set, ",")); it->setSearchParameters(search_parameters); } From e00a3e3749608d50a8cb7dfb5ac25130d64e1d94 Mon Sep 17 00:00:00 2001 From: walzer Date: Tue, 21 Mar 2017 16:12:26 +0000 Subject: [PATCH 29/41] [FIX] fixing one comet feature fallback --- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp 
b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 3ced541bcca..8d34668f0c8 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -237,8 +237,17 @@ namespace OpenMS double ln_expect = log(hit->getMetaValue("MS:1002257").toString().toDouble()); hit->setMetaValue("COMET:lnExpect", ln_expect); - - double ln_num_sp = log(hit->getMetaValue("MS:1002255").toString().toDouble()); + + double ln_num_sp; + if (hit->metaValueExists("num_matched_peptides")) + { + double num_sp = hit->getMetaValue("num_matched_peptides").toString().toDouble(); + ln_num_sp = log(max(1.0, num_sp)); // if recorded, one can be safely assumed + } + else // fallback + { + ln_num_sp = hit->getMetaValue("MS:1002255").toString().toDouble(); + } double ln_rank_sp = log(max(1.0, hit->getMetaValue("MS:1002256").toString().toDouble())); hit->setMetaValue("COMET:lnNumSP", ln_num_sp); hit->setMetaValue("COMET:lnRankSP", ln_rank_sp); From 0f15bf55662523755e5b5941532c195df1d808ec Mon Sep 17 00:00:00 2001 From: Leon Bichmann Date: Fri, 24 Mar 2017 15:53:20 +0100 Subject: [PATCH 30/41] COMET Adapter XTandem parts replaced by COMET, header still required to be changed --- src/utils/COMETAdapter.cpp | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/src/utils/COMETAdapter.cpp b/src/utils/COMETAdapter.cpp index 48ef6f7e206..015bdc730ad 100755 --- a/src/utils/COMETAdapter.cpp +++ b/src/utils/COMETAdapter.cpp @@ -29,15 +29,13 @@ // // -------------------------------------------------------------------------- // $Maintainer: Chris Bielow $ -// $Authors: Leon Bichmann, Andreas Bertsch, Chris Bielow $ +// $Authors: Leon Bichmann, Timo Sachsenberg, Andreas Bertsch, Chris Bielow $ // -------------------------------------------------------------------------- #include #include #include #include -#include -#include #include #include #include @@ -62,15 +60,15 @@ using namespace std; 
//------------------------------------------------------------- /** - @page TOPP_XTandemAdapter XTandemAdapter + @page TOPP_COMETAdapter COMETAdapter - @brief Identifies peptides in MS/MS spectra via XTandem. + @brief Identifies peptides in MS/MS spectra via COMET.
- + @@ -80,34 +78,28 @@ using namespace std;
pot. predecessor tools \f$ \longrightarrow \f$ XTandemAdapter \f$ \longrightarrow \f$ \f$ \longrightarrow \f$ COMETAdapter \f$ \longrightarrow \f$ pot. successor tools
- @em X! Tandem must be installed before this wrapper can be used. This wrapper - has been successfully tested with several versions of X! Tandem. - The last known version to work is 2009-04-01. We encountered problems with - later versions (namely 2010-01-01). - - To speed up computations, FASTA databases can be compressed using the fasta_pro.exe - tool of @em X! Tandem. It is contained in the "bin" folder of the @em X! Tandem installation. - Refer to the docu of @em X! Tandem for further information about settings. + @em COMET must be installed before this wrapper can be used. This wrapper + has been successfully tested with several versions of COMET. This adapter supports relative database filenames, which (when not found in the current working directory) is looked up in the directories specified by 'OpenMS.ini:id_db_dir' (see @subpage TOPP_advanced). - X! Tandem settings not exposed by this adapter can be directly adjusted using an XML configuration file. + COMET settings not exposed by this adapter can be directly adjusted using an XML configuration file. By default, all (!) parameters available explicitly via this wrapper take precedence over the XML configuration file. The parameter "default_config_file" can be used to specify such a custom configuration. An example of a configuration file (named "default_input.xml") is contained in the "bin" folder of the - @em X! Tandem installation and the OpenMS installation under OpenMS/share/CHEMISTRY/XTandem_default_input.xml. + @em COMET installation and the OpenMS installation under OpenMS/share/CHEMISTRY/COMET_default_input.xml. The latter is loaded by default. If you want to use the XML configuration file and @em ignore most of the parameters set via this adapter, use the '-ignore_adapter_param' - flag. Then, the config given in '-default_config_file' is used exclusively and only '-in', '-out', '-database' and '-xtandem_executable' are + flag. 
Then, the config given in '-default_config_file' is used exclusively and only '-in', '-out', '-database' and '-COMET_executable' are taken from this adapter. @note Currently mzIdentML (mzid) is not directly supported as an input/output format of this tool. Convert mzid files to/from idXML using @ref TOPP_IDFileConverter if necessary. The command line parameters of this tool are: - @verbinclude TOPP_XTandemAdapter.cli + @verbinclude TOPP_COMETAdapter.cli INI file documentation of this tool: - @htmlinclude TOPP_XTandemAdapter.html + @htmlinclude TOPP_COMETAdapter.html */ // We do not want this class to show up in the docu: @@ -135,7 +127,7 @@ class TOPPCOMETAdapter : setValidFormats_("database", ListUtils::create("FASTA")); registerInputFile_("comet_executable", "", // choose the default value according to the platform where it will be executed - // X! Tandem compiles as tandem on OSX and tandem.exe on any other platform + // COMET compiles as tandem on OSX and tandem.exe on any other platform #if defined(__APPLE__) "comet.exe", #else From 192a204c4b9a5a4592bba3f5e541787cc3cead0f Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sat, 25 Mar 2017 14:10:20 +0000 Subject: [PATCH 31/41] [DOC] percolator related tools documentation --- doc/doxygen/public/TOPP.doxygen | 1 + doc/doxygen/public/UTILS.doxygen | 2 + .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 100 ++++++++++++++++++ src/openms/source/ANALYSIS/ID/TopPerc.cpp | 4 +- src/topp/PercolatorAdapter.cpp | 30 +++--- src/utils/PSMFeatureExtractor.cpp | 57 +++++----- 6 files changed, 155 insertions(+), 39 deletions(-) diff --git a/doc/doxygen/public/TOPP.doxygen b/doc/doxygen/public/TOPP.doxygen index aa3b9b1380c..0e369506a3c 100755 --- a/doc/doxygen/public/TOPP.doxygen +++ b/doc/doxygen/public/TOPP.doxygen @@ -139,6 +139,7 @@ - @subpage TOPP_PeptideIndexer - Refreshes the protein references for all peptide hits. - @subpage TOPP_PhosphoScoring - Scores potential phosphorylation sites in order to localize the most probable sites. 
- @subpage TOPP_ProteinInference - Infer proteins from a list of (high-confidence) peptides. + - @subpage TOPP_PercolatorAdapter - Percolates the given protein/peptide identifications. Targeted Experiments - @subpage TOPP_InclusionExclusionListCreator - Creates inclusion and/or exclusion lists for LC-MS/MS experiments. diff --git a/doc/doxygen/public/UTILS.doxygen b/doc/doxygen/public/UTILS.doxygen index 1b1c10995f9..910b9380508 100644 --- a/doc/doxygen/public/UTILS.doxygen +++ b/doc/doxygen/public/UTILS.doxygen @@ -78,6 +78,8 @@ - @subpage UTILS_RNPxl - Tool for RNP cross linking experiment analysis. - @subpage UTILS_SequenceCoverageCalculator - Prints information about idXML files. - @subpage UTILS_SpecLibCreator - Creates an MSP-formatted spectral library. + - @subpage UTILS_PSMFeatureExtractor - Creates search engine specific featrues for PercolatorAdapter input. + Quantitation - @subpage UTILS_ERPairFinder - Evaluate pair ratios on enhanced resolution (zoom) scans. diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index a54a9e9b881..0605b12c7d5 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -52,23 +52,115 @@ namespace OpenMS { + /** + @brief Percolator feature and I/O handler + + This class contains functions to handle (compute, aggregate) Percolator + features. + */ + class OPENMS_DLLAPI TopPerc { public: + /** + * @brief concatMULTISEPeptideIds + * @param all_peptide_ids PeptideIdentification vector to append to + * @param new_peptide_ids PeptideIdentification vector to be appended + * @param search_engine search engine to depend on for feature creation + * + * Appends a vector of PeptideIdentification to another and registers concatenation Percolator features depending on given search engine. 
+ */ static void concatMULTISEPeptideIds(std::vector& all_peptide_ids, std::vector& new_peptide_ids, String search_engine); + + /** + * @brief mergeMULTISEPeptideIds + * @param all_peptide_ids PeptideIdentification vector to be merged into + * @param new_peptide_ids PeptideIdentification vector to merge + * @param search_engine search engine to depend on for feature creation + * + * Merges a vector of PeptideIdentification into another and registers merge Percolator features depending on given search engine. + */ static void mergeMULTISEPeptideIds(std::vector& all_peptide_ids, std::vector& new_peptide_ids, String search_engine); + + /** + * @brief mergeMULTISEProteinIds + * @param all_protein_ids ProteinIdentification vector to be merged into + * @param new_protein_ids ProteinIdentification vector to merge + * + * Concatenates SearchParameter of multiple search engine runs and merges PeptideEvidences, registers the created Percolator features + */ static void mergeMULTISEProteinIds(std::vector& all_protein_ids, std::vector& new_protein_ids); + + /** + * @brief addMSGFFeatures + * @param peptide_ids PeptideIdentification vector to create Percolator featrues in + * @param feature_set register of added features + * + * Creates and adds MSGF+ specific Percolator features and registers them in feature_set + */ static void addMSGFFeatures(std::vector& peptide_ids, StringList& feature_set); + + /** + * @brief addXTANDEMFeatures + * @param peptide_ids PeptideIdentification vector to create Percolator featrues in + * @param feature_set register of added features + * + * Creates and adds X!Tandem specific Percolator features and registers them in feature_set + */ static void addXTANDEMFeatures(std::vector& peptide_ids, StringList& feature_set); + + /** + * @brief addCOMETFeatures + * @param peptide_ids PeptideIdentification vector to create Percolator featrues in + * @param feature_set register of added features + * + * Creates and adds Comet specific Percolator features and 
registers them in feature_set + */ static void addCOMETFeatures(std::vector& peptide_ids, StringList& feature_set); + + /** + * @brief addMASCOTFeatures + * @param peptide_ids PeptideIdentification vector to create Percolator featrues in + * @param feature_set register of added features + * + * Creates and adds Mascot specific Percolator features and registers them in feature_set + */ static void addMASCOTFeatures(std::vector& peptide_ids, StringList& feature_set); + + /** + * @brief addMULTISEFeatures + * @param peptide_ids PeptideIdentification vector to create Percolator featrues in + * @param search_engines_used the list of search engines to be considered + * @param feature_set register of added features + * @param complete_only will only add features for PeptideIdentifications where all given search engines identified something + * @param limits_imputation + * + * Adds multiple search engine specific Percolator features and registers them in feature_set + */ static void addMULTISEFeatures(std::vector& peptide_ids, StringList& search_engines_used, StringList& feature_set, bool complete_only = true, bool limits_imputation = false); + + /** + * @brief addCONCATSEFeatures + * @param peptide_id_list PeptideIdentification vector to create Percolator featrues in + * @param search_engines_used the list of search engines to be considered + * @param feature_set register of added features + * + * Adds multiple search engine specific Percolator features and registers them in feature_set + */ static void addCONCATSEFeatures(std::vector& peptide_id_list, StringList& search_engines_used, StringList& feature_set); + /** + * @brief checkExtraFeatures + * @param psms the vector of PeptideHit to be checked + * @param extra_features the list of requested extra features + * + * checks and removes requested extra Percolator features that are actually inavailable (to compute) + */ static void checkExtraFeatures(const std::vector &psms, StringList& extra_features); + /// For 
accession dependent sorting of ProteinHits struct lq_ProteinHit { inline bool operator() (const ProteinHit& h1, const ProteinHit& h2) @@ -77,6 +169,7 @@ namespace OpenMS } }; + /// For accession dependent sorting of PeptideEvidences struct lq_PeptideEvidence { inline bool operator() (const PeptideEvidence& h1, const PeptideEvidence& h2) @@ -90,9 +183,16 @@ namespace OpenMS TopPerc(); virtual ~TopPerc(); + /// Rescales the fragment features to penalize features calculated by few ions, adapted from MSGFtoPercolator static double rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons); + + /// helper functin for assigning the frequently occurring feature delta score static void assignDeltaScore_(std::vector& hits, String score_ref, String output_ref); + + /// helper function to check for human MHC typical peptide termini static bool hasMHCEnd_(String peptide); + + /// gets the scan identifer to merge by static String getScanMergeKey_(std::vector::iterator it, std::vector::iterator start); }; diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 8d34668f0c8..94c7d46843e 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -305,8 +305,8 @@ namespace OpenMS { bool unique_to_protein = (String(hit->getMetaValue("protein_references")) == "unique"); bool has_mod = hit->getSequence().isModified(); - hit->setMetaValue("COMET:uniqueToProt", unique_to_protein); - hit->setMetaValue("COMET:hasMod", has_mod); + hit->setMetaValue("MASCOT:uniqueToProt", unique_to_protein); + hit->setMetaValue("MASCOT:hasMod", has_mod); } } } diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp index d21e13ae37f..844bf764e27 100644 --- a/src/topp/PercolatorAdapter.cpp +++ b/src/topp/PercolatorAdapter.cpp @@ -72,20 +72,24 @@ using namespace std; @experimental This tool is work in progress and usage and input requirements might change.
- - - - - - - - - - -
potential predecessor tools \f$ \longrightarrow \f$ MSGF+\f$ \longrightarrow \f$ potential successor tools
@ref TOPP_IDFilter @ref TOPP_IDMapper
+ + + + + + + + + + +
pot. predecessor tools \f$ \longrightarrow \f$ PercolatorAdapter \f$ \longrightarrow \f$ pot. successor tools
@ref UTILS_PSMFeatureExtractor @ref TOPP_IDFilter
- -

Percolator is search engine sensitive, i.e. it's input features vary, depending on the search engine.

+

Percolator is search engine sensitive, i.e. its input features vary, +depending on the search engine. Must be prepared beforehand. If you do not want +to use the specific features, use the generic-feature-set flag. Will incorporate +the score attribute of a PSM, so be sure, the score you want is set as main +score with @ref TOPP_IDScoreSwitcher . Be aware, that you might very well +experience a performance loss compared to the search engine specific features.

The command line parameters of this tool are: @verbinclude TOPP_PercolatorAdapter.cli diff --git a/src/utils/PSMFeatureExtractor.cpp b/src/utils/PSMFeatureExtractor.cpp index 60697a7ecaa..c473885a0f2 100644 --- a/src/utils/PSMFeatureExtractor.cpp +++ b/src/utils/PSMFeatureExtractor.cpp @@ -58,37 +58,47 @@ using namespace std; //------------------------------------------------------------- /** - @page TOPP_PSMFeatureExtractor PSMFeatureExtractor + @page UTILS_PSMFeatureExtractor PSMFeatureExtractor @brief PSMFeatureExtractor computes extra features for each input PSM - @experimental This tool is work in progress and usage and input requirements might change. + @experimental Parts of this tool are still work in progress and usage and input requirements or output might change. (multiple_search_engine, Mascot support)
- - - - - - - - - -
potential predecessor tools \f$ \longrightarrow \f$ MSGF+\f$ \longrightarrow \f$ potential successor tools
@ref TOPP_PercolatorAdapter
+ + + + + + + + + + +
pot. predecessor tools \f$ \longrightarrow \f$ PSMFeatureExtractor \f$ \longrightarrow \f$ pot. successor tools
@ref TOPP_PeptideIndexer @ref TOPP_PercolatorAdapter
-

PSMFeatureExtractor is search engine sensitive, i.e. it's extra features vary, depending on the search engine.

+

+PSMFeatureExtractor is search engine sensitive, i.e. its extra features +vary, depending on the search engine. Thus, please make sure the input is +compliant with TOPP SearchengineAdapter output. Also, PeptideIndexer compliant +target/decoy annotation is mandatory. +Currently supported search engines are Comet, X!Tandem, MSGF+. +Mascot support is available but in beta development. +

+ + @note if you have extra features you want to pass to percolator, use the extra + flag and list the MetaData entries containing the extra features. The command line parameters of this tool are: - @verbinclude TOPP_PSMFeatureExtractor.cli + @verbinclude UTILS_PSMFeatureExtractor.cli INI file documentation of this tool: - @htmlinclude TOPP_PSMFeatureExtractor.html + @htmlinclude UTILS_PSMFeatureExtractor.html */ // We do not want this class to show up in the docu: /// @cond TOPPCLASSES - class PSMFeatureExtractor : public TOPPBase { @@ -105,16 +115,15 @@ class PSMFeatureExtractor : setValidFormats_("in", ListUtils::create("mzid,idXML")); registerOutputFile_("out", "", "", "Output file in idXML format", false); registerOutputFile_("mzid_out", "", "", "Output file in mzid format", false); - registerFlag_("multiple_search_engines", "Combine PSMs from different search engines by merging on scan level."); - - // TODO: add this MHC feature back in with TopPerc::hasMHCEnd_() - //registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); - registerFlag_("skip_db_check", "Manual override to skip the check if same settings for multiple search engines were applied.", true); - registerFlag_("concat", "Naive merging of PSMs from different search engines: concatenate multiple search results instead of merging on scan level. 
Only valid together wtih -multiple_search_engines flag.", true); registerStringList_("extra", "", vector(), "List of the MetaData parameters to be included in a feature set for precolator.", false, false); // setValidStrings_("extra", ?); - registerFlag_("impute", "Will instead of discarding all PSM not unanimously detected by all SE, impute missing values by their respective scores min/max observed.", true); - registerFlag_("limit_imputation", "Will impute missing scores with the worst numerical limit (instead of min/max observed) of the respective score.", true); + // TODO: add this MHC feature back in with TopPerc::hasMHCEnd_() + //registerFlag_("MHC", "Add a feature for MHC ligand properties to the specific PSM.", true); + registerFlag_("multiple_search_engines", "Combine PSMs from different search engines by merging on scan level."); + registerFlag_("skip_db_check", "Manual override to skip the check if same settings for multiple search engines were applied. Only valid together with -multiple_search_engines flag.", true); + registerFlag_("concat", "Naive merging of PSMs from different search engines: concatenate multiple search results instead of merging on scan level. Only valid together with -multiple_search_engines flag.", true); + registerFlag_("impute", "Will instead of discarding all PSM not unanimously detected by all SE, impute missing values by their respective scores min/max observed. Only valid together with -multiple_search_engines flag.", true); + registerFlag_("limit_imputation", "Will impute missing scores with the worst numerical limit (instead of min/max observed) of the respective score. 
Only valid together with -multiple_search_engines flag.", true); } ExitCodes main_(int, const char**) From fccbca3e8d53011d696766aa633fb2d38d0e4dc8 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sun, 26 Mar 2017 14:43:27 +0100 Subject: [PATCH 32/41] [TEST] added TopPerc test suite as start --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 3 - src/openms/source/ANALYSIS/ID/TopPerc.cpp | 63 +- .../data/combined.concat.perco.in.idXML | 826 ++++++++++++++++++ .../openms/data/combined.merge.perco.in.idXML | 179 ++++ .../openms/data/comet.topperc.idXML | 242 +++++ .../openms/data/comet.topperc_check.idXML | 347 ++++++++ .../openms/data/msgf.topperc.idXML | 483 ++++++++++ .../openms/data/msgf.topperc_check.idXML | 638 ++++++++++++++ .../openms/data/xtandem.topperc.idXML | 77 ++ .../openms/data/xtandem.topperc_check.idXML | 104 +++ .../class_tests/openms/executables.cmake | 1 + .../openms/source/TopPerc_test.cpp | 257 ++++++ 12 files changed, 3156 insertions(+), 64 deletions(-) create mode 100644 src/tests/class_tests/openms/data/combined.concat.perco.in.idXML create mode 100644 src/tests/class_tests/openms/data/combined.merge.perco.in.idXML create mode 100644 src/tests/class_tests/openms/data/comet.topperc.idXML create mode 100644 src/tests/class_tests/openms/data/comet.topperc_check.idXML create mode 100644 src/tests/class_tests/openms/data/msgf.topperc.idXML create mode 100644 src/tests/class_tests/openms/data/msgf.topperc_check.idXML create mode 100644 src/tests/class_tests/openms/data/xtandem.topperc.idXML create mode 100644 src/tests/class_tests/openms/data/xtandem.topperc_check.idXML create mode 100644 src/tests/class_tests/openms/source/TopPerc_test.cpp diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 0605b12c7d5..5f891e17335 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -189,9 +189,6 @@ namespace OpenMS /// helper functin for 
assigning the frequently occurring feature delta score static void assignDeltaScore_(std::vector& hits, String score_ref, String output_ref); - /// helper function to check for human MHC typical peptide termini - static bool hasMHCEnd_(String peptide); - /// gets the scan identifer to merge by static String getScanMergeKey_(std::vector::iterator it, std::vector::iterator start); diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 94c7d46843e..9aa7338c385 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -38,50 +38,7 @@ using namespace std; namespace OpenMS -{ - /* - void TopPerc::prepareCUSTOMpin(vector& peptide_ids, vector& user_param_features) - { - // Create header for the features - string min_featureset = "SpecId, Label, ScanNr"; - StringList txt_header = ListUtils::create(min_featureset); - txt_header.insert(txt_header.end(), user_param_features.begin(), user_param_features.end() ); - txt.addLine(ListUtils::concatenate(txt_header, out_sep)); - - for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) - { - for (vector::const_iterator hit = it->getHits().begin(); hit != it->getHits().end(); ++hit) - { - String scan_identifier = getScanIdentifier(it, peptide_ids.begin()); - Int scan_number = getScanNumber(scan_identifier); - int label = 1; - if (hit->metaValueExists("target_decoy") && String(hit->getMetaValue("target_decoy")).hasSubstring("decoy")) - { - label = -1; - } - - StringList collected_feats; - collected_feats.push_back(scan_identifier); - collected_feats.push_back(String(label)); - collected_feats.push_back(String(scan_number)); - - for (vector::const_iterator feat = user_param_features.begin(); feat != user_param_features.end(); ++feat) - { - // Some Hits have no NumMatchedMainIons, and MeanError, etc. values. Have to ignore them! 
- if (hit->metaValueExists(*feat)) - { - collected_feats.push_back(hit->getMetaValue(*feat).toString()); - } - } - if (collected_feats.size() == user_param_features.size()) - { // only if all feats were present add - txt.addLine(ListUtils::concatenate(collected_feats, out_sep)); - } - } - } - } - */ - +{ void TopPerc::addMSGFFeatures(vector& peptide_ids, StringList& feature_set) { feature_set.push_back("MS:1002049"); // unchanged RawScore @@ -798,23 +755,7 @@ namespace OpenMS (hits.end()-1)->setMetaValue(output_ref, 0.0); //if last hit or only one hit } } - - bool TopPerc::hasMHCEnd_(String peptide) - { - bool suf = false; - static const string arr[] = {"A", "F", "I", "K", "M", "L", "R", "W", "V"}; - vector mhcends (arr, arr + sizeof(arr) / sizeof(arr[0]) ); - for (std::vector::iterator eit = mhcends.begin(); eit != mhcends.end(); ++eit) - { - if (peptide.hasSuffix(string(*eit))) - { - suf = true; - break; - } - } - return suf; - } - + // TODO: this is code redundancy to PercolatorAdapter String TopPerc::getScanMergeKey_(vector::iterator it, vector::iterator start) { diff --git a/src/tests/class_tests/openms/data/combined.concat.perco.in.idXML b/src/tests/class_tests/openms/data/combined.concat.perco.in.idXML new file mode 100644 index 00000000000..f90b5876d9d --- /dev/null +++ b/src/tests/class_tests/openms/data/combined.concat.perco.in.iddiff --git a/src/tests/class_tests/openms/data/combined.merge.perco.in.idXML b/src/tests/class_tests/openms/data/combined.merge.perco.in.idXML new file mode 100644 index 00000000000..77f5d42d17c --- /dev/null +++ b/src/tests/class_tests/openms/data/combined.merge.perco.in.idXML @@ -0,0 +1,179 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + diff --git a/src/tests/class_tests/openms/data/comet.topperc.idXML b/src/tests/class_tests/openms/data/comet.topperc.idXML new file mode 100644 index 00000000000..9a0309ea6e3 --- /dev/null +++ b/src/tests/class_tests/openms/data/comet.topperc.iddiff --git a/src/tests/class_tests/openms/data/comet.topperc_check.idXML b/src/tests/class_tests/openms/data/comet.topperc_check.idXML new file mode 100644 index 00000000000..0c39a50317d --- /dev/null +++ b/src/tests/class_tests/openms/data/comet.topperc_check.iddiff --git a/src/tests/class_tests/openms/data/msgf.topperc.idXML b/src/tests/class_tests/openms/data/msgf.topperc.idXML new file mode 100644 index 00000000000..b559403539a --- /dev/null +++ b/src/tests/class_tests/openms/data/msgf.topperc.iddiff --git a/src/tests/class_tests/openms/data/msgf.topperc_check.idXML b/src/tests/class_tests/openms/data/msgf.topperc_check.idXML new file mode 100644 index 00000000000..4be5561260e --- /dev/null +++ b/src/tests/class_tests/openms/data/msgf.topperc_check.iddiff --git a/src/tests/class_tests/openms/data/xtandem.topperc.idXML b/src/tests/class_tests/openms/data/xtandem.topperc.idXML new file mode 100644 index 00000000000..f04289e6eca --- /dev/null +++ b/src/tests/class_tests/openms/data/xtandem.topperc.idXML @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/tests/class_tests/openms/data/xtandem.topperc_check.idXML b/src/tests/class_tests/openms/data/xtandem.topperc_check.idXML new file mode 100644 index 00000000000..5fdc6b11146 --- /dev/null +++ b/src/tests/class_tests/openms/data/xtandem.topperc_check.idXML @@ -0,0 +1,104 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index ae27d2b0b5b..8d127c5d2b1 100644 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -485,6 +485,7 @@ set(analysis_executables_list SimpleSVM_test StablePairFinder_test #TargetedExperimentHelper_test + TopPerc_test TransformationDescription_test TransformationModel_test TransformationModelBSpline_test diff --git a/src/tests/class_tests/openms/source/TopPerc_test.cpp b/src/tests/class_tests/openms/source/TopPerc_test.cpp new file mode 100644 index 00000000000..d600cabccc9 --- /dev/null +++ b/src/tests/class_tests/openms/source/TopPerc_test.cpp @@ -0,0 +1,257 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2017. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. 
+// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: MATHIAS WALZER$ +// $Authors: MATHIAS WALZER$ +// -------------------------------------------------------------------------- + +#include +#include +#include +#include + +/////////////////////////// +#include +/////////////////////////// + +using namespace OpenMS; +using namespace std; + +bool check_pepids(vector check, vector against) +{ + std::vector upk, upkc; + TEST_EQUAL(check.size(), against.size()) + if (check.size() != against.size()) + return false; + for (size_t i = 0; i < check.size(); ++i) + { + TEST_EQUAL(check[i].getHits().size(), against[i].getHits().size()) + for (size_t j = 0; j < check[i].getHits().size(); ++j) + { + check [i].getHits()[j].getKeys(upkc); + against[i].getHits()[j].getKeys(upk); + TEST_EQUAL(upkc.size(), upk.size()) + if (upkc.size() != upk.size()) + return false; + for (size_t k = 0; k < upk.size(); ++k) + TEST_STRING_EQUAL(upkc[k],upk[k]) + } + } + return true; +} + +bool check_proids(vector check, vector against, vector fs) +{ + TEST_EQUAL(check.size(), 
against.size()) + if (check.size()!= against.size()) + return false; + for (size_t i = 0; i < check.size(); ++i) + TEST_EQUAL(check[i].getHits().size(), against[i].getHits().size()) + + String efc = check.front().getSearchParameters().getMetaValue("extra_features"); + TEST_STRING_EQUAL(efc, ListUtils::concatenate(fs, ",")) + return true; +} + +START_TEST(TopPerc, "$Id$") + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// + +STATUS("Preparing test inputs.") + +std::vector< PeptideIdentification > comet_check_pids; +std::vector< PeptideIdentification > msgf_check_pids; +std::vector< PeptideIdentification > xtandem_check_pids; +std::vector< PeptideIdentification > merge_check_pids; +std::vector< PeptideIdentification > concat_check_pids; +std::vector< ProteinIdentification > comet_check_pods; +std::vector< ProteinIdentification > msgf_check_pods; +std::vector< ProteinIdentification > xtandem_check_pods; +std::vector< ProteinIdentification > concat_check_pods; +std::vector< ProteinIdentification > merge_check_pods; + +IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc_check.idXML"), comet_check_pods, comet_check_pids); +IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc_check.idXML"), msgf_check_pods, msgf_check_pids); +IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("xtandem.topperc_check.idXML"), xtandem_check_pods, xtandem_check_pids); +IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("combined.merge.perco.in.idXML"), merge_check_pods, merge_check_pids); +IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("combined.concat.perco.in.idXML"), concat_check_pods, concat_check_pids); + +PeptideIdentification pid; +StringList fs; + +START_SECTION((static void concatMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) +{ + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification 
> comet_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + + std::vector< PeptideIdentification > concat_pids; + TopPerc::concatMULTISEPeptideIds(concat_pids, msgf_pids, "MS-GF+"); + TopPerc::concatMULTISEPeptideIds(concat_pids, comet_pids, "Comet"); + + //check completeness of feature construction + ABORT_IF(check_pepids(concat_check_pids, concat_pids)); +} +END_SECTION + +START_SECTION((static void mergeMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) +{ + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification > comet_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + + std::vector< PeptideIdentification > concat_pids; + TopPerc::concatMULTISEPeptideIds(concat_pids, msgf_pids, "MS-GF+"); + TopPerc::concatMULTISEPeptideIds(concat_pids, comet_pids, "Comet"); + + //check completeness of feature construction + ABORT_IF(check_pepids(concat_check_pids, concat_pids)); +} +END_SECTION + +START_SECTION((static void mergeMULTISEProteinIds(std::vector< ProteinIdentification > &all_protein_ids, std::vector< ProteinIdentification > &new_protein_ids))) +{ + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification > comet_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + 
IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + + std::vector< ProteinIdentification > merge_pods; + TopPerc::mergeMULTISEProteinIds(merge_pods, msgf_pods); + TopPerc::mergeMULTISEProteinIds(merge_pods, comet_pods); + + //check completeness of feature construction + ABORT_IF(check_proids(merge_check_pods, merge_pods, fs)); +} +END_SECTION + +START_SECTION((static void addMSGFFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +{ + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + TopPerc::addMSGFFeatures(msgf_pids,fs); + + //check completeness of feature construction + ABORT_IF(check_pepids(msgf_check_pids, msgf_pids)); + + //check registration of percolator features for adapter + ABORT_IF(check_proids(msgf_check_pods, msgf_pods, fs)); +} +END_SECTION + +START_SECTION((static void addXTANDEMFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +{ + std::vector< PeptideIdentification > xtandem_pids; + std::vector< ProteinIdentification > xtandem_pods; + + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("xtandem.topperc.idXML"), xtandem_pods, xtandem_pids); + TopPerc::addXTANDEMFeatures(xtandem_pids, fs); + + //check completeness of feature construction + ABORT_IF(check_pepids(xtandem_check_pids, xtandem_pids)); + + //check registration of percolator features for adapter + ABORT_IF(check_proids(xtandem_check_pods, xtandem_pods, fs)); +} +END_SECTION + +START_SECTION((static void addCOMETFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +{ + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification > comet_pods; + + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + TopPerc::addCOMETFeatures(comet_pids, fs); + + 
//check completeness of feature construction + ABORT_IF(check_pepids(comet_check_pids, comet_pids)); + + //check registration of percolator features for adapter + ABORT_IF(check_proids(comet_check_pods, comet_pods, fs)); +} +END_SECTION + +START_SECTION((static void addMASCOTFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +{ + NOT_TESTABLE +} +END_SECTION + +START_SECTION((static void addMULTISEFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &search_engines_used, StringList &feature_set, bool complete_only=true, bool limits_imputation=false))) +{ + // TODO +} +END_SECTION + +START_SECTION((static void addCONCATSEFeatures(std::vector< PeptideIdentification > &peptide_id_list, StringList &search_engines_used, StringList &feature_set))) +{ + // TODO +} +END_SECTION + +START_SECTION((static void checkExtraFeatures(const std::vector< PeptideHit > &psms, StringList &extra_features))) +{ + NOT_TESTABLE +} +END_SECTION + +START_SECTION(([TopPerc::lq_PeptideEvidence] bool operator()(const PeptideEvidence &h1, const PeptideEvidence &h2))) +{ + NOT_TESTABLE +} +END_SECTION + +START_SECTION(([TopPerc::lq_ProteinHit] bool operator()(const ProteinHit &h1, const ProteinHit &h2))) +{ + NOT_TESTABLE +} +END_SECTION + + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// +END_TEST + + + From 4783b49e7f101bfe29d705d7389e64fb53b7f3a8 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sun, 26 Mar 2017 16:26:29 +0100 Subject: [PATCH 33/41] [TEST] fixed some test, some extended --- .../openms/source/TopPerc_test.cpp | 228 ++++++++++-------- 1 file changed, 122 insertions(+), 106 deletions(-) diff --git a/src/tests/class_tests/openms/source/TopPerc_test.cpp b/src/tests/class_tests/openms/source/TopPerc_test.cpp index d600cabccc9..05b4c4d0527 100644 --- a/src/tests/class_tests/openms/source/TopPerc_test.cpp +++ 
b/src/tests/class_tests/openms/source/TopPerc_test.cpp @@ -61,7 +61,7 @@ bool check_pepids(vector check, vector &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) -{ - std::vector< PeptideIdentification > comet_pids; - std::vector< ProteinIdentification > comet_pods; - IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); - - std::vector< PeptideIdentification > msgf_pids; - std::vector< ProteinIdentification > msgf_pods; - IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); - - std::vector< PeptideIdentification > concat_pids; - TopPerc::concatMULTISEPeptideIds(concat_pids, msgf_pids, "MS-GF+"); - TopPerc::concatMULTISEPeptideIds(concat_pids, comet_pids, "Comet"); - - //check completeness of feature construction - ABORT_IF(check_pepids(concat_check_pids, concat_pids)); -} -END_SECTION - -START_SECTION((static void mergeMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) -{ - std::vector< PeptideIdentification > comet_pids; - std::vector< ProteinIdentification > comet_pods; - IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); - - std::vector< PeptideIdentification > msgf_pids; - std::vector< ProteinIdentification > msgf_pods; - IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); - - std::vector< PeptideIdentification > concat_pids; - TopPerc::concatMULTISEPeptideIds(concat_pids, msgf_pids, "MS-GF+"); - TopPerc::concatMULTISEPeptideIds(concat_pids, comet_pids, "Comet"); - - //check completeness of feature construction - ABORT_IF(check_pepids(concat_check_pids, concat_pids)); -} -END_SECTION - -START_SECTION((static void mergeMULTISEProteinIds(std::vector< ProteinIdentification > &all_protein_ids, std::vector< ProteinIdentification > &new_protein_ids))) -{ - std::vector< 
PeptideIdentification > comet_pids; - std::vector< ProteinIdentification > comet_pods; - IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); - - std::vector< PeptideIdentification > msgf_pids; - std::vector< ProteinIdentification > msgf_pods; - IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); - - std::vector< ProteinIdentification > merge_pods; - TopPerc::mergeMULTISEProteinIds(merge_pods, msgf_pods); - TopPerc::mergeMULTISEProteinIds(merge_pods, comet_pods); - - //check completeness of feature construction - ABORT_IF(check_proids(merge_check_pods, merge_pods, fs)); -} -END_SECTION - -START_SECTION((static void addMSGFFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) -{ - std::vector< PeptideIdentification > msgf_pids; - std::vector< ProteinIdentification > msgf_pods; - - IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); - TopPerc::addMSGFFeatures(msgf_pids,fs); - - //check completeness of feature construction - ABORT_IF(check_pepids(msgf_check_pids, msgf_pids)); - - //check registration of percolator features for adapter - ABORT_IF(check_proids(msgf_check_pods, msgf_pods, fs)); -} -END_SECTION - -START_SECTION((static void addXTANDEMFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) -{ - std::vector< PeptideIdentification > xtandem_pids; - std::vector< ProteinIdentification > xtandem_pods; - - IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("xtandem.topperc.idXML"), xtandem_pods, xtandem_pids); - TopPerc::addXTANDEMFeatures(xtandem_pids, fs); - - //check completeness of feature construction - ABORT_IF(check_pepids(xtandem_check_pids, xtandem_pids)); - - //check registration of percolator features for adapter - ABORT_IF(check_proids(xtandem_check_pods, xtandem_pods, fs)); -} -END_SECTION - -START_SECTION((static void addCOMETFeatures(std::vector< PeptideIdentification > 
&peptide_ids, StringList &feature_set))) -{ - std::vector< PeptideIdentification > comet_pids; - std::vector< ProteinIdentification > comet_pods; - - IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); - TopPerc::addCOMETFeatures(comet_pids, fs); - - //check completeness of feature construction - ABORT_IF(check_pepids(comet_check_pids, comet_pids)); - - //check registration of percolator features for adapter - ABORT_IF(check_proids(comet_check_pods, comet_pods, fs)); -} -END_SECTION +//START_SECTION((static void concatMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) +//{ +// std::vector< PeptideIdentification > comet_pids; +// std::vector< ProteinIdentification > comet_pods; +// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + +// std::vector< PeptideIdentification > msgf_pids; +// std::vector< ProteinIdentification > msgf_pods; +// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + +// StringList ses = ListUtils::create("MS-GF+,Comet"); +// std::vector< PeptideIdentification > concat_pids; +// TopPerc::concatMULTISEPeptideIds(concat_pids, msgf_pids, "MS-GF+"); +// TopPerc::concatMULTISEPeptideIds(concat_pids, comet_pids, "Comet"); +// TopPerc::addCONCATSEFeatures(concat_pids, ses, fs); + +// //check completeness of feature construction +// ABORT_IF(!check_pepids(concat_check_pids, concat_pids)); +//} +//END_SECTION + +//START_SECTION((static void mergeMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) +//{ +// std::vector< PeptideIdentification > comet_pids; +// std::vector< ProteinIdentification > comet_pods; +// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + +// std::vector< 
PeptideIdentification > msgf_pids; +// std::vector< ProteinIdentification > msgf_pods; +// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + +// std::vector< PeptideIdentification > merge_pids; +// StringList ses = ListUtils::create("MS-GF+,Comet"); +// TopPerc::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); +// TopPerc::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); +// StringList empty_extra; +// TopPerc::addMULTISEFeatures(merge_pids, ses, empty_extra, true); +// for (size_t i = merge_pids.size()-1; i > 0; --i) +// { +// TopPerc::checkExtraFeatures(merge_pids[i].getHits(), empty_extra); // also check against empty extra features list and inconsistency removal +// merge_pids.erase(merge_pids.begin()+i); //erase to be able to use completeness check function below +// } +// //check completeness of feature construction +// ABORT_IF(!check_pepids(merge_check_pids, merge_pids)); +//} +//END_SECTION + +//START_SECTION((static void mergeMULTISEProteinIds(std::vector< ProteinIdentification > &all_protein_ids, std::vector< ProteinIdentification > &new_protein_ids))) +//{ +// std::vector< PeptideIdentification > comet_pids; +// std::vector< ProteinIdentification > comet_pods; +// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + +// std::vector< PeptideIdentification > msgf_pids; +// std::vector< ProteinIdentification > msgf_pods; +// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + +// std::vector< ProteinIdentification > merge_pods; +// TopPerc::mergeMULTISEProteinIds(merge_pods, msgf_pods); +// TopPerc::mergeMULTISEProteinIds(merge_pods, comet_pods); + +// StringList fs; +// std::vector< PeptideIdentification > merge_pids; +// StringList ses = ListUtils::create("MS-GF+,Comet"); +// TopPerc::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); +// TopPerc::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); +// 
TopPerc::addMULTISEFeatures(merge_pids, ses, fs, true); + +// //check completeness of feature construction +// ABORT_IF(!check_proids(merge_check_pods, merge_pods, fs)); +//} +//END_SECTION + +//START_SECTION((static void addMSGFFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +//{ +// std::vector< PeptideIdentification > msgf_pids; +// std::vector< ProteinIdentification > msgf_pods; + +// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); +// TopPerc::addMSGFFeatures(msgf_pids,fs); + +// //check completeness of feature construction +// ABORT_IF(check_pepids(msgf_check_pids, msgf_pids)); + +// //check registration of percolator features for adapter +// ABORT_IF(!check_proids(msgf_check_pods, msgf_pods, fs)); +//} +//END_SECTION + +//START_SECTION((static void addXTANDEMFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +//{ +// std::vector< PeptideIdentification > xtandem_pids; +// std::vector< ProteinIdentification > xtandem_pods; + +// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("xtandem.topperc.idXML"), xtandem_pods, xtandem_pids); +// TopPerc::addXTANDEMFeatures(xtandem_pids, fs); + +// //check completeness of feature construction +// ABORT_IF(check_pepids(xtandem_check_pids, xtandem_pids)); + +// //check registration of percolator features for adapter +// ABORT_IF(check_proids(xtandem_check_pods, xtandem_pods, fs)); +//} +//END_SECTION + +//START_SECTION((static void addCOMETFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +//{ +// std::vector< PeptideIdentification > comet_pids; +// std::vector< ProteinIdentification > comet_pods; + +// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); +// TopPerc::addCOMETFeatures(comet_pids, fs); + +// //check completeness of feature construction +// ABORT_IF(!check_pepids(comet_check_pids, comet_pids)); + +// //check registration of 
percolator features for adapter +// ABORT_IF(!check_proids(comet_check_pods, comet_pods, fs)); +//} +//END_SECTION START_SECTION((static void addMASCOTFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) { @@ -226,7 +242,7 @@ END_SECTION START_SECTION((static void addCONCATSEFeatures(std::vector< PeptideIdentification > &peptide_id_list, StringList &search_engines_used, StringList &feature_set))) { - // TODO + NOT_TESTABLE // actually tested in combination with concatMULTISEPeptideIds } END_SECTION From 726dcfbb1d87ba47e6a0e82369664e0c9e9303fe Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sun, 26 Mar 2017 16:59:00 +0100 Subject: [PATCH 34/41] [TEST] activated rest of the test and moved unused operators in protected scope --- .../include/OpenMS/ANALYSIS/ID/TopPerc.h | 28 +- .../openms/source/TopPerc_test.cpp | 260 +++++++++--------- 2 files changed, 139 insertions(+), 149 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 5f891e17335..593b40a5153 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -159,6 +159,20 @@ namespace OpenMS * checks and removes requested extra Percolator features that are actually inavailable (to compute) */ static void checkExtraFeatures(const std::vector &psms, StringList& extra_features); + + + protected: + TopPerc(); + virtual ~TopPerc(); + + /// Rescales the fragment features to penalize features calculated by few ions, adapted from MSGFtoPercolator + static double rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons); + + /// helper functin for assigning the frequently occurring feature delta score + static void assignDeltaScore_(std::vector& hits, String score_ref, String output_ref); + + /// gets the scan identifer to merge by + static String getScanMergeKey_(std::vector::iterator it, std::vector::iterator start); /// For accession dependent 
sorting of ProteinHits struct lq_ProteinHit @@ -177,20 +191,6 @@ namespace OpenMS return (h1.getProteinAccession() < h2.getProteinAccession()); } }; - - - protected: - TopPerc(); - virtual ~TopPerc(); - - /// Rescales the fragment features to penalize features calculated by few ions, adapted from MSGFtoPercolator - static double rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons); - - /// helper functin for assigning the frequently occurring feature delta score - static void assignDeltaScore_(std::vector& hits, String score_ref, String output_ref); - - /// gets the scan identifer to merge by - static String getScanMergeKey_(std::vector::iterator it, std::vector::iterator start); }; diff --git a/src/tests/class_tests/openms/source/TopPerc_test.cpp b/src/tests/class_tests/openms/source/TopPerc_test.cpp index 05b4c4d0527..d6bdeee5537 100644 --- a/src/tests/class_tests/openms/source/TopPerc_test.cpp +++ b/src/tests/class_tests/openms/source/TopPerc_test.cpp @@ -104,166 +104,156 @@ IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("xtandem.topperc_check.idXML"), xtand IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("combined.merge.perco.in.idXML"), merge_check_pods, merge_check_pids); IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("combined.concat.perco.in.idXML"), concat_check_pods, concat_check_pids); -PeptideIdentification pid; -StringList fs; - -//START_SECTION((static void concatMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) -//{ -// std::vector< PeptideIdentification > comet_pids; -// std::vector< ProteinIdentification > comet_pods; -// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); - -// std::vector< PeptideIdentification > msgf_pids; -// std::vector< ProteinIdentification > msgf_pods; -// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); - -// StringList ses = 
ListUtils::create("MS-GF+,Comet"); -// std::vector< PeptideIdentification > concat_pids; -// TopPerc::concatMULTISEPeptideIds(concat_pids, msgf_pids, "MS-GF+"); -// TopPerc::concatMULTISEPeptideIds(concat_pids, comet_pids, "Comet"); -// TopPerc::addCONCATSEFeatures(concat_pids, ses, fs); - -// //check completeness of feature construction -// ABORT_IF(!check_pepids(concat_check_pids, concat_pids)); -//} -//END_SECTION - -//START_SECTION((static void mergeMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) -//{ -// std::vector< PeptideIdentification > comet_pids; -// std::vector< ProteinIdentification > comet_pods; -// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); - -// std::vector< PeptideIdentification > msgf_pids; -// std::vector< ProteinIdentification > msgf_pods; -// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); - -// std::vector< PeptideIdentification > merge_pids; -// StringList ses = ListUtils::create("MS-GF+,Comet"); -// TopPerc::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); -// TopPerc::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); -// StringList empty_extra; -// TopPerc::addMULTISEFeatures(merge_pids, ses, empty_extra, true); -// for (size_t i = merge_pids.size()-1; i > 0; --i) -// { -// TopPerc::checkExtraFeatures(merge_pids[i].getHits(), empty_extra); // also check against empty extra features list and inconsistency removal -// merge_pids.erase(merge_pids.begin()+i); //erase to be able to use completeness check function below -// } -// //check completeness of feature construction -// ABORT_IF(!check_pepids(merge_check_pids, merge_pids)); -//} -//END_SECTION - -//START_SECTION((static void mergeMULTISEProteinIds(std::vector< ProteinIdentification > &all_protein_ids, std::vector< ProteinIdentification > &new_protein_ids))) -//{ -// std::vector< 
PeptideIdentification > comet_pids; -// std::vector< ProteinIdentification > comet_pods; -// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); - -// std::vector< PeptideIdentification > msgf_pids; -// std::vector< ProteinIdentification > msgf_pods; -// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); - -// std::vector< ProteinIdentification > merge_pods; -// TopPerc::mergeMULTISEProteinIds(merge_pods, msgf_pods); -// TopPerc::mergeMULTISEProteinIds(merge_pods, comet_pods); - -// StringList fs; -// std::vector< PeptideIdentification > merge_pids; -// StringList ses = ListUtils::create("MS-GF+,Comet"); -// TopPerc::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); -// TopPerc::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); -// TopPerc::addMULTISEFeatures(merge_pids, ses, fs, true); - -// //check completeness of feature construction -// ABORT_IF(!check_proids(merge_check_pods, merge_pods, fs)); -//} -//END_SECTION - -//START_SECTION((static void addMSGFFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) -//{ -// std::vector< PeptideIdentification > msgf_pids; -// std::vector< ProteinIdentification > msgf_pods; - -// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); -// TopPerc::addMSGFFeatures(msgf_pids,fs); - -// //check completeness of feature construction -// ABORT_IF(check_pepids(msgf_check_pids, msgf_pids)); - -// //check registration of percolator features for adapter -// ABORT_IF(!check_proids(msgf_check_pods, msgf_pods, fs)); -//} -//END_SECTION - -//START_SECTION((static void addXTANDEMFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) -//{ -// std::vector< PeptideIdentification > xtandem_pids; -// std::vector< ProteinIdentification > xtandem_pods; - -// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("xtandem.topperc.idXML"), xtandem_pods, xtandem_pids); -// 
TopPerc::addXTANDEMFeatures(xtandem_pids, fs); - -// //check completeness of feature construction -// ABORT_IF(check_pepids(xtandem_check_pids, xtandem_pids)); - -// //check registration of percolator features for adapter -// ABORT_IF(check_proids(xtandem_check_pods, xtandem_pods, fs)); -//} -//END_SECTION - -//START_SECTION((static void addCOMETFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) -//{ -// std::vector< PeptideIdentification > comet_pids; -// std::vector< ProteinIdentification > comet_pods; - -// IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); -// TopPerc::addCOMETFeatures(comet_pids, fs); - -// //check completeness of feature construction -// ABORT_IF(!check_pepids(comet_check_pids, comet_pids)); - -// //check registration of percolator features for adapter -// ABORT_IF(!check_proids(comet_check_pods, comet_pods, fs)); -//} -//END_SECTION +START_SECTION((static void concatMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) +{ + StringList fs; + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification > comet_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + + StringList ses = ListUtils::create("MS-GF+,Comet"); + std::vector< PeptideIdentification > concat_pids; + TopPerc::concatMULTISEPeptideIds(concat_pids, msgf_pids, "MS-GF+"); + TopPerc::concatMULTISEPeptideIds(concat_pids, comet_pids, "Comet"); + TopPerc::addCONCATSEFeatures(concat_pids, ses, fs); + + //check completeness of feature construction + ABORT_IF(!check_pepids(concat_check_pids, concat_pids)); +} +END_SECTION -START_SECTION((static void 
addMASCOTFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) +START_SECTION((static void mergeMULTISEPeptideIds(std::vector< PeptideIdentification > &all_peptide_ids, std::vector< PeptideIdentification > &new_peptide_ids, String search_engine))) { - NOT_TESTABLE + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification > comet_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + + std::vector< PeptideIdentification > merge_pids; + StringList ses = ListUtils::create("MS-GF+,Comet"); + TopPerc::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); + TopPerc::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); + StringList empty_extra; + TopPerc::addMULTISEFeatures(merge_pids, ses, empty_extra, true); + TEST_EQUAL(merge_pids.size(),4) + for (size_t i = merge_pids.size()-1; i > 0; --i) + { + TopPerc::checkExtraFeatures(merge_pids[i].getHits(), empty_extra); // also check against empty extra features list and inconsistency removal + merge_pids.erase(merge_pids.begin()+i); //erase to be able to use completeness check function below + } + TEST_EQUAL(merge_pids.size(),1) + //check completeness of feature construction + ABORT_IF(!check_pepids(merge_check_pids, merge_pids)); } END_SECTION -START_SECTION((static void addMULTISEFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &search_engines_used, StringList &feature_set, bool complete_only=true, bool limits_imputation=false))) +START_SECTION((static void mergeMULTISEProteinIds(std::vector< ProteinIdentification > &all_protein_ids, std::vector< ProteinIdentification > &new_protein_ids))) { - // TODO + StringList fs; + std::vector< PeptideIdentification > comet_pids; + std::vector< 
ProteinIdentification > comet_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + + std::vector< ProteinIdentification > merge_pods; + TopPerc::mergeMULTISEProteinIds(merge_pods, msgf_pods); + TopPerc::mergeMULTISEProteinIds(merge_pods, comet_pods); + + std::vector< PeptideIdentification > merge_pids; + StringList ses = ListUtils::create("MS-GF+,Comet"); + TopPerc::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); + TopPerc::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); + TopPerc::addMULTISEFeatures(merge_pids, ses, fs, true); + + //check completeness of feature construction + ABORT_IF(!check_proids(merge_check_pods, merge_pods, fs)); } END_SECTION -START_SECTION((static void addCONCATSEFeatures(std::vector< PeptideIdentification > &peptide_id_list, StringList &search_engines_used, StringList &feature_set))) +START_SECTION((static void addMSGFFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) { - NOT_TESTABLE // actually tested in combination with concatMULTISEPeptideIds + StringList fs; + std::vector< PeptideIdentification > msgf_pids; + std::vector< ProteinIdentification > msgf_pods; + + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); + TopPerc::addMSGFFeatures(msgf_pids,fs); + + //check completeness of feature construction + ABORT_IF(check_pepids(msgf_check_pids, msgf_pids)); + + //check registration of percolator features for adapter + ABORT_IF(!check_proids(msgf_check_pods, msgf_pods, fs)); } END_SECTION -START_SECTION((static void checkExtraFeatures(const std::vector< PeptideHit > &psms, StringList &extra_features))) +START_SECTION((static void addXTANDEMFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList 
&feature_set))) { - NOT_TESTABLE + StringList fs; + std::vector< PeptideIdentification > xtandem_pids; + std::vector< ProteinIdentification > xtandem_pods; + + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("xtandem.topperc.idXML"), xtandem_pods, xtandem_pids); + TopPerc::addXTANDEMFeatures(xtandem_pids, fs); + + //check completeness of feature construction + ABORT_IF(check_pepids(xtandem_check_pids, xtandem_pids)); + + //check registration of percolator features for adapter + ABORT_IF(check_proids(xtandem_check_pods, xtandem_pods, fs)); } END_SECTION -START_SECTION(([TopPerc::lq_PeptideEvidence] bool operator()(const PeptideEvidence &h1, const PeptideEvidence &h2))) +START_SECTION((static void addCOMETFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) { - NOT_TESTABLE + StringList fs; + std::vector< PeptideIdentification > comet_pids; + std::vector< ProteinIdentification > comet_pods; + + IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); + TopPerc::addCOMETFeatures(comet_pids, fs); + + //check completeness of feature construction + ABORT_IF(!check_pepids(comet_check_pids, comet_pids)); + + //check registration of percolator features for adapter + ABORT_IF(!check_proids(comet_check_pods, comet_pods, fs)); } END_SECTION -START_SECTION(([TopPerc::lq_ProteinHit] bool operator()(const ProteinHit &h1, const ProteinHit &h2))) +START_SECTION((static void addMASCOTFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &feature_set))) { - NOT_TESTABLE + NOT_TESTABLE // yet } END_SECTION +START_SECTION((static void addMULTISEFeatures(std::vector< PeptideIdentification > &peptide_ids, StringList &search_engines_used, StringList &feature_set, bool complete_only=true, bool limits_imputation=false))) +{ + NOT_TESTABLE // actually tested in combination with mergeMULTISEPeptideIds +} +END_SECTION + +START_SECTION((static void addCONCATSEFeatures(std::vector< PeptideIdentification > 
&peptide_id_list, StringList &search_engines_used, StringList &feature_set))) +{ + NOT_TESTABLE // actually tested in combination with concatMULTISEPeptideIds +} +END_SECTION + +START_SECTION((static void checkExtraFeatures(const std::vector< PeptideHit > &psms, StringList &extra_features))) +{ + NOT_TESTABLE // actually tested in combination with mergeMULTISEPeptideIds +} +END_SECTION ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// From bf28aec207c2035d14126422aee07a75c24a63d7 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sun, 26 Mar 2017 18:10:00 +0100 Subject: [PATCH 35/41] [NOP] typos and docu clarifications --- doc/doxygen/public/TOPP.doxygen | 2 +- doc/doxygen/public/UTILS.doxygen | 2 +- src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/doxygen/public/TOPP.doxygen b/doc/doxygen/public/TOPP.doxygen index 0e369506a3c..5c6429bbe8e 100755 --- a/doc/doxygen/public/TOPP.doxygen +++ b/doc/doxygen/public/TOPP.doxygen @@ -139,7 +139,7 @@ - @subpage TOPP_PeptideIndexer - Refreshes the protein references for all peptide hits. - @subpage TOPP_PhosphoScoring - Scores potential phosphorylation sites in order to localize the most probable sites. - @subpage TOPP_ProteinInference - Infer proteins from a list of (high-confidence) peptides. - - @subpage TOPP_PercolatorAdapter - Percolates the given protein/peptide identifications. + - @subpage TOPP_PercolatorAdapter - Applies the percolator algorithm to protein/peptide identifications. Targeted Experiments - @subpage TOPP_InclusionExclusionListCreator - Creates inclusion and/or exclusion lists for LC-MS/MS experiments. 
diff --git a/doc/doxygen/public/UTILS.doxygen b/doc/doxygen/public/UTILS.doxygen index 910b9380508..7adfc65bcc6 100644 --- a/doc/doxygen/public/UTILS.doxygen +++ b/doc/doxygen/public/UTILS.doxygen @@ -78,7 +78,7 @@ - @subpage UTILS_RNPxl - Tool for RNP cross linking experiment analysis. - @subpage UTILS_SequenceCoverageCalculator - Prints information about idXML files. - @subpage UTILS_SpecLibCreator - Creates an MSP-formatted spectral library. - - @subpage UTILS_PSMFeatureExtractor - Creates search engine specific featrues for PercolatorAdapter input. + - @subpage UTILS_PSMFeatureExtractor - Creates search engine specific features for PercolatorAdapter input. Quantitation diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h index 593b40a5153..3fc833e1f2e 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h @@ -53,9 +53,9 @@ namespace OpenMS { /** - @brief Percolator feature and I/O handler + @brief Percolator feature set and integration helper - This class contains functions to handle (compute, aggregate) Percolator + This class contains functions to handle (compute, aggregate, integrate) Percolator features. */ @@ -77,7 +77,7 @@ namespace OpenMS * @brief mergeMULTISEPeptideIds * @param all_peptide_ids PeptideIdentification vector to be merged into * @param new_peptide_ids PeptideIdentification vector to merge - * @param search_engine search engine to depend on for feature creation + * @param search_engine search engine to create features from their scores * * Merges a vector of PeptideIdentification into another and registers merge Percolator features depending on given search engine. 
*/ @@ -95,7 +95,7 @@ namespace OpenMS /** * @brief addMSGFFeatures - * @param peptide_ids PeptideIdentification vector to create Percolator featrues in + * @param peptide_ids PeptideIdentification vector to create Percolator features in * @param feature_set register of added features * * Creates and adds MSGF+ specific Percolator features and registers them in feature_set @@ -156,7 +156,7 @@ namespace OpenMS * @param psms the vector of PeptideHit to be checked * @param extra_features the list of requested extra features * - * checks and removes requested extra Percolator features that are actually inavailable (to compute) + * checks and removes requested extra Percolator features that are actually unavailable (to compute) */ static void checkExtraFeatures(const std::vector &psms, StringList& extra_features); From 338a505f4272ca209c9b3cc485fced1117a3d8ef Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sun, 26 Mar 2017 18:23:24 +0100 Subject: [PATCH 36/41] [FIX] added warning, 'incomplete' MSGF PSM is encountered --- src/openms/source/ANALYSIS/ID/TopPerc.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/TopPerc.cpp index 9aa7338c385..fdbdd11dc9c 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/TopPerc.cpp @@ -116,6 +116,7 @@ namespace OpenMS hit->setMetaValue("MSGF:StdevErrorTop7", stdev_error_top7); } } + else LOG_WARN << "MS-GF+ PSM with missing NumMatchedMainIons skipped." 
<< endl; } } } From cde5e72f5c750b62ff9c3faee4f0cda48d09862c Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sun, 26 Mar 2017 18:35:30 +0100 Subject: [PATCH 37/41] [FIX] fixed merge conflict with knime_package_support.cmake --- cmake/knime_package_support.cmake | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cmake/knime_package_support.cmake b/cmake/knime_package_support.cmake index 161b73615fc..82aadb4643d 100644 --- a/cmake/knime_package_support.cmake +++ b/cmake/knime_package_support.cmake @@ -264,15 +264,11 @@ if(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}) elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/OMSSA OR NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/XTandem OR NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/MSGFPlus) message(FATAL_ERROR "The given search engine directory seems to have an invalid layout. ${FOLDER_STRUCTURE_MESSAGE}") elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/Fido) -<<<<<<< HEAD message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (Fido is missing). ${FOLDER_STRUCTURE_MESSAGE}") elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/LuciPHOr2) message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (LuciPHOr2 is missing). ${FOLDER_STRUCTURE_MESSAGE}") -======= - message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (Fido is missing). Please check use the one from the SVN.") elseif(NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/Percolator) message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (Percolator is missing). Please check use the one from the SVN.") ->>>>>>> PercolatorAdapter working elseif(NOT APPLE AND NOT EXISTS ${SEARCH_ENGINES_DIRECTORY}/MyriMatch) message(FATAL_ERROR "The given search engine directory seems to have an invalid layout (MyriMatch is missing). 
${FOLDER_STRUCTURE_MESSAGE}") endif() From 5ac5dd379ef9498ccf747615f69b0c62510e9212 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Sun, 26 Mar 2017 19:51:10 +0100 Subject: [PATCH 38/41] [RENAME, DOC] renamed TopPerc, some more documentation to its use --- ...TopPerc.h => PercolatorFeatureSetHelper.h} | 15 +++++---- .../include/OpenMS/ANALYSIS/ID/sources.cmake | 2 +- ...erc.cpp => PercolatorFeatureSetHelper.cpp} | 32 +++++++++---------- src/openms/source/ANALYSIS/ID/sources.cmake | 2 +- .../class_tests/openms/executables.cmake | 2 +- ...pp => PercolatorFeatureSetHelper_test.cpp} | 32 +++++++++---------- src/topp/PercolatorAdapter.cpp | 6 ++-- src/utils/PSMFeatureExtractor.cpp | 22 ++++++------- 8 files changed, 58 insertions(+), 55 deletions(-) rename src/openms/include/OpenMS/ANALYSIS/ID/{TopPerc.h => PercolatorFeatureSetHelper.h} (94%) rename src/openms/source/ANALYSIS/ID/{TopPerc.cpp => PercolatorFeatureSetHelper.cpp} (94%) rename src/tests/class_tests/openms/source/{TopPerc_test.cpp => PercolatorFeatureSetHelper_test.cpp} (88%) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h b/src/openms/include/OpenMS/ANALYSIS/ID/PercolatorFeatureSetHelper.h similarity index 94% rename from src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h rename to src/openms/include/OpenMS/ANALYSIS/ID/PercolatorFeatureSetHelper.h index 3fc833e1f2e..f680b252e11 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/TopPerc.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/PercolatorFeatureSetHelper.h @@ -55,11 +55,14 @@ namespace OpenMS /** @brief Percolator feature set and integration helper - This class contains functions to handle (compute, aggregate, integrate) Percolator - features. + This class contains functions to handle (compute, aggregate, integrate) + Percolator features. 
This includes the calculation or extraction of + Percolator features for the specific search engine usage, preparation for + PercolatorApater usage and result reintegration and in the case of + multiple search engine incorporation of different features. */ - class OPENMS_DLLAPI TopPerc + class OPENMS_DLLAPI PercolatorFeatureSetHelper { public: @@ -162,8 +165,8 @@ namespace OpenMS protected: - TopPerc(); - virtual ~TopPerc(); + PercolatorFeatureSetHelper(); + virtual ~PercolatorFeatureSetHelper(); /// Rescales the fragment features to penalize features calculated by few ions, adapted from MSGFtoPercolator static double rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons); @@ -196,5 +199,5 @@ namespace OpenMS } //namespace OpenMS -#endif //OPENMS_ANALYSIS_ID_TOPPERC_H +#endif //OPENMS_ANALYSIS_ID_PERCOLATORFEATURESETHELPER_H diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake index 6c25f4b381c..178844e4e54 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake +++ b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake @@ -23,7 +23,7 @@ MetaboliteSpectralMatching.h PeptideProteinResolution.h ProtonDistributionModel.h PeptideIndexing.h -TopPerc.h +PercolatorFeatureSetHelper.h ) ### add path to the filenames diff --git a/src/openms/source/ANALYSIS/ID/TopPerc.cpp b/src/openms/source/ANALYSIS/ID/PercolatorFeatureSetHelper.cpp similarity index 94% rename from src/openms/source/ANALYSIS/ID/TopPerc.cpp rename to src/openms/source/ANALYSIS/ID/PercolatorFeatureSetHelper.cpp index fdbdd11dc9c..ddf998bf0aa 100644 --- a/src/openms/source/ANALYSIS/ID/TopPerc.cpp +++ b/src/openms/source/ANALYSIS/ID/PercolatorFeatureSetHelper.cpp @@ -33,13 +33,13 @@ // -------------------------------------------------------------------------- #include -#include +#include using namespace std; namespace OpenMS { - void TopPerc::addMSGFFeatures(vector& peptide_ids, StringList& feature_set) + void 
PercolatorFeatureSetHelper::addMSGFFeatures(vector& peptide_ids, StringList& feature_set) { feature_set.push_back("MS:1002049"); // unchanged RawScore feature_set.push_back("MS:1002050"); // unchanged DeNovoScore @@ -121,7 +121,7 @@ namespace OpenMS } } - void TopPerc::addXTANDEMFeatures(vector& peptide_ids, StringList& feature_set) + void PercolatorFeatureSetHelper::addXTANDEMFeatures(vector& peptide_ids, StringList& feature_set) { // Find out which ions are in XTandem-File and take only these as features StringList ion_types = ListUtils::create("a,b,c,x,y,z"); @@ -162,7 +162,7 @@ namespace OpenMS } } - void TopPerc::addCOMETFeatures(vector& peptide_ids, StringList& feature_set) + void PercolatorFeatureSetHelper::addCOMETFeatures(vector& peptide_ids, StringList& feature_set) { feature_set.push_back("COMET:deltCn"); // recalculated deltCn = (current_XCorr - 2nd_best_XCorr) / max(current_XCorr, 1) feature_set.push_back("COMET:deltLCn"); // deltLCn = (current_XCorr - worst_XCorr) / max(current_XCorr, 1) @@ -245,7 +245,7 @@ namespace OpenMS 17. seqCov Sequence coverage of matched ions (per ion series). Not available in mascot adapter. 18. intMatched Matched ion intensity (per ion series). Not available in mascot adapter. 
*/ - void TopPerc::addMASCOTFeatures(vector& peptide_ids, StringList& feature_set) + void PercolatorFeatureSetHelper::addMASCOTFeatures(vector& peptide_ids, StringList& feature_set) { feature_set.push_back("MS:1001171"); // unchanged mScore feature_set.push_back("MASCOT:delta_score"); // delta score based on mScore @@ -269,7 +269,7 @@ namespace OpenMS } } - void TopPerc::addCONCATSEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set) + void PercolatorFeatureSetHelper::addCONCATSEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set) { for (StringList::iterator it = search_engines_used.begin(); it != search_engines_used.end(); ++it) { feature_set.push_back("CONCAT:" + *it); @@ -287,7 +287,7 @@ namespace OpenMS } } - void TopPerc::mergeMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids, String search_engine) + void PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids, String search_engine) { LOG_DEBUG << "creating spectrum map" << endl; @@ -419,7 +419,7 @@ namespace OpenMS } // references from PeptideHits to ProteinHits work with the protein accessions, so no need to update the PeptideHits - void TopPerc::mergeMULTISEProteinIds(vector& all_protein_ids, vector& new_protein_ids) + void PercolatorFeatureSetHelper::mergeMULTISEProteinIds(vector& all_protein_ids, vector& new_protein_ids) { LOG_DEBUG << "merging search parameters" << endl; @@ -444,7 +444,7 @@ namespace OpenMS std::vector& new_protein_hits = new_protein_ids.front().getHits(); LOG_DEBUG << "Sorting " << new_protein_hits.size() << " new ProteinHits." << endl; - std::sort(new_protein_hits.begin(), new_protein_hits.end(), TopPerc::lq_ProteinHit()); + std::sort(new_protein_hits.begin(), new_protein_hits.end(), PercolatorFeatureSetHelper::lq_ProteinHit()); LOG_DEBUG << "Melting with " << all_protein_hits.size() << " previous ProteinHits." 
<< endl; if (all_protein_hits.empty()) @@ -457,7 +457,7 @@ namespace OpenMS std::vector::iterator uni = set_union( all_protein_hits.begin(), all_protein_hits.end(), new_protein_hits.begin(), new_protein_hits.end(), tmp_protein_hits.begin(), - TopPerc::lq_ProteinHit() ); + PercolatorFeatureSetHelper::lq_ProteinHit() ); tmp_protein_hits.resize(uni - tmp_protein_hits.begin()); all_protein_hits.swap(tmp_protein_hits); } @@ -509,7 +509,7 @@ namespace OpenMS LOG_DEBUG << "Merging for this file finished." << endl; } - void TopPerc::concatMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids, String search_engine) + void PercolatorFeatureSetHelper::concatMULTISEPeptideIds(vector& all_peptide_ids, vector& new_peptide_ids, String search_engine) { for (vector::iterator pit = new_peptide_ids.begin(); pit != new_peptide_ids.end(); ++pit) { @@ -542,7 +542,7 @@ namespace OpenMS all_peptide_ids.insert(all_peptide_ids.end(), new_peptide_ids.begin(), new_peptide_ids.end()); } - void TopPerc::addMULTISEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set, bool complete_only, bool limits_imputation) + void PercolatorFeatureSetHelper::addMULTISEFeatures(vector& peptide_ids, StringList& search_engines_used, StringList& feature_set, bool complete_only, bool limits_imputation) { map > extremals; // will have as keys the below SE cv terms vector max_better, min_better; @@ -709,7 +709,7 @@ namespace OpenMS } } - void TopPerc::checkExtraFeatures(const vector& psms, StringList& extra_features) + void PercolatorFeatureSetHelper::checkExtraFeatures(const vector& psms, StringList& extra_features) { set unavail; for (vector::const_iterator hit = psms.begin(); hit != psms.end(); ++hit) @@ -731,7 +731,7 @@ namespace OpenMS // Function adapted from MsgfplusReader in Percolator converter - double TopPerc::rescaleFragmentFeature_(double featureValue, int NumMatchedMainIons) + double PercolatorFeatureSetHelper::rescaleFragmentFeature_(double featureValue, 
int NumMatchedMainIons) { // Rescale the fragment features to penalize features calculated by few ions int numMatchedIonLimit = 7; @@ -740,7 +740,7 @@ namespace OpenMS return featureValue * ((double)numerator / denominator); } - void TopPerc::assignDeltaScore_(vector& hits, String score_ref, String output_ref) + void PercolatorFeatureSetHelper::assignDeltaScore_(vector& hits, String score_ref, String output_ref) { if (!hits.empty()) { @@ -758,7 +758,7 @@ namespace OpenMS } // TODO: this is code redundancy to PercolatorAdapter - String TopPerc::getScanMergeKey_(vector::iterator it, vector::iterator start) + String PercolatorFeatureSetHelper::getScanMergeKey_(vector::iterator it, vector::iterator start) { // MSGF+ uses this field, is empty if not specified String scan_identifier = it->getMetaValue("spectrum_reference"); diff --git a/src/openms/source/ANALYSIS/ID/sources.cmake b/src/openms/source/ANALYSIS/ID/sources.cmake index 6aa2d36ebb3..defacb14ccc 100644 --- a/src/openms/source/ANALYSIS/ID/sources.cmake +++ b/src/openms/source/ANALYSIS/ID/sources.cmake @@ -23,7 +23,7 @@ MetaboliteSpectralMatching.cpp PeptideProteinResolution.cpp ProtonDistributionModel.cpp PeptideIndexing.cpp -TopPerc.cpp +PercolatorFeatureSetHelper.cpp ) ### add path to the filenames diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index 8d127c5d2b1..764d455863f 100644 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -485,7 +485,7 @@ set(analysis_executables_list SimpleSVM_test StablePairFinder_test #TargetedExperimentHelper_test - TopPerc_test + PercolatorFeatureSetHelper_test TransformationDescription_test TransformationModel_test TransformationModelBSpline_test diff --git a/src/tests/class_tests/openms/source/TopPerc_test.cpp b/src/tests/class_tests/openms/source/PercolatorFeatureSetHelper_test.cpp similarity index 88% rename from 
src/tests/class_tests/openms/source/TopPerc_test.cpp rename to src/tests/class_tests/openms/source/PercolatorFeatureSetHelper_test.cpp index d6bdeee5537..5cbd0393597 100644 --- a/src/tests/class_tests/openms/source/TopPerc_test.cpp +++ b/src/tests/class_tests/openms/source/PercolatorFeatureSetHelper_test.cpp @@ -38,7 +38,7 @@ #include /////////////////////////// -#include +#include /////////////////////////// using namespace OpenMS; @@ -117,9 +117,9 @@ START_SECTION((static void concatMULTISEPeptideIds(std::vector< PeptideIdentific StringList ses = ListUtils::create("MS-GF+,Comet"); std::vector< PeptideIdentification > concat_pids; - TopPerc::concatMULTISEPeptideIds(concat_pids, msgf_pids, "MS-GF+"); - TopPerc::concatMULTISEPeptideIds(concat_pids, comet_pids, "Comet"); - TopPerc::addCONCATSEFeatures(concat_pids, ses, fs); + PercolatorFeatureSetHelper::concatMULTISEPeptideIds(concat_pids, msgf_pids, "MS-GF+"); + PercolatorFeatureSetHelper::concatMULTISEPeptideIds(concat_pids, comet_pids, "Comet"); + PercolatorFeatureSetHelper::addCONCATSEFeatures(concat_pids, ses, fs); //check completeness of feature construction ABORT_IF(!check_pepids(concat_check_pids, concat_pids)); @@ -138,14 +138,14 @@ START_SECTION((static void mergeMULTISEPeptideIds(std::vector< PeptideIdentifica std::vector< PeptideIdentification > merge_pids; StringList ses = ListUtils::create("MS-GF+,Comet"); - TopPerc::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); - TopPerc::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); + PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); + PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); StringList empty_extra; - TopPerc::addMULTISEFeatures(merge_pids, ses, empty_extra, true); + PercolatorFeatureSetHelper::addMULTISEFeatures(merge_pids, ses, empty_extra, true); TEST_EQUAL(merge_pids.size(),4) for (size_t i = merge_pids.size()-1; i > 0; --i) { - 
TopPerc::checkExtraFeatures(merge_pids[i].getHits(), empty_extra); // also check against empty extra features list and inconsistency removal + PercolatorFeatureSetHelper::checkExtraFeatures(merge_pids[i].getHits(), empty_extra); // also check against empty extra features list and inconsistency removal merge_pids.erase(merge_pids.begin()+i); //erase to be able to use completeness check function below } TEST_EQUAL(merge_pids.size(),1) @@ -166,14 +166,14 @@ START_SECTION((static void mergeMULTISEProteinIds(std::vector< ProteinIdentifica IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); std::vector< ProteinIdentification > merge_pods; - TopPerc::mergeMULTISEProteinIds(merge_pods, msgf_pods); - TopPerc::mergeMULTISEProteinIds(merge_pods, comet_pods); + PercolatorFeatureSetHelper::mergeMULTISEProteinIds(merge_pods, msgf_pods); + PercolatorFeatureSetHelper::mergeMULTISEProteinIds(merge_pods, comet_pods); std::vector< PeptideIdentification > merge_pids; StringList ses = ListUtils::create("MS-GF+,Comet"); - TopPerc::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); - TopPerc::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); - TopPerc::addMULTISEFeatures(merge_pids, ses, fs, true); + PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(merge_pids, msgf_pids, "MS-GF+"); + PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(merge_pids, comet_pids, "Comet"); + PercolatorFeatureSetHelper::addMULTISEFeatures(merge_pids, ses, fs, true); //check completeness of feature construction ABORT_IF(!check_proids(merge_check_pods, merge_pods, fs)); @@ -187,7 +187,7 @@ START_SECTION((static void addMSGFFeatures(std::vector< PeptideIdentification > std::vector< ProteinIdentification > msgf_pods; IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("msgf.topperc.idXML"), msgf_pods, msgf_pids); - TopPerc::addMSGFFeatures(msgf_pids,fs); + PercolatorFeatureSetHelper::addMSGFFeatures(msgf_pids,fs); //check completeness of feature construction 
ABORT_IF(check_pepids(msgf_check_pids, msgf_pids)); @@ -204,7 +204,7 @@ START_SECTION((static void addXTANDEMFeatures(std::vector< PeptideIdentification std::vector< ProteinIdentification > xtandem_pods; IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("xtandem.topperc.idXML"), xtandem_pods, xtandem_pids); - TopPerc::addXTANDEMFeatures(xtandem_pids, fs); + PercolatorFeatureSetHelper::addXTANDEMFeatures(xtandem_pids, fs); //check completeness of feature construction ABORT_IF(check_pepids(xtandem_check_pids, xtandem_pids)); @@ -221,7 +221,7 @@ START_SECTION((static void addCOMETFeatures(std::vector< PeptideIdentification > std::vector< ProteinIdentification > comet_pods; IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("comet.topperc.idXML"), comet_pods, comet_pids); - TopPerc::addCOMETFeatures(comet_pids, fs); + PercolatorFeatureSetHelper::addCOMETFeatures(comet_pids, fs); //check completeness of feature construction ABORT_IF(!check_pepids(comet_check_pids, comet_pids)); diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp index 844bf764e27..e13c6f08191 100644 --- a/src/topp/PercolatorAdapter.cpp +++ b/src/topp/PercolatorAdapter.cpp @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include #include @@ -284,7 +284,7 @@ class PercolatorAdapter : scan_number = it->substr(idx + 5).toInt(); break; } - else if ((idx = it->find("index=")) != string::npos) + else if ((idx = it->hasPrefix("index=")) != string::npos) { scan_number = it->substr(idx + 6).toInt(); } @@ -627,7 +627,7 @@ class PercolatorAdapter : LOG_INFO << "Merging peptide ids." << endl; all_peptide_ids.insert(all_peptide_ids.end(), peptide_ids.begin(), peptide_ids.end()); LOG_INFO << "Merging protein ids." 
<< endl; - TopPerc::mergeMULTISEProteinIds(all_protein_ids, protein_ids); + PercolatorFeatureSetHelper::mergeMULTISEProteinIds(all_protein_ids, protein_ids); } return EXECUTION_OK; } diff --git a/src/utils/PSMFeatureExtractor.cpp b/src/utils/PSMFeatureExtractor.cpp index c473885a0f2..a62c8379d2f 100644 --- a/src/utils/PSMFeatureExtractor.cpp +++ b/src/utils/PSMFeatureExtractor.cpp @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include @@ -208,15 +208,15 @@ class PSMFeatureExtractor : if (concatenate) { // will concatenate the list - TopPerc::concatMULTISEPeptideIds(all_peptide_ids, peptide_ids, search_engine); + PercolatorFeatureSetHelper::concatMULTISEPeptideIds(all_peptide_ids, peptide_ids, search_engine); } else { // will collapse the list (reference) - TopPerc::mergeMULTISEPeptideIds(all_peptide_ids, peptide_ids, search_engine); + PercolatorFeatureSetHelper::mergeMULTISEPeptideIds(all_peptide_ids, peptide_ids, search_engine); } } - TopPerc::mergeMULTISEProteinIds(all_protein_ids, protein_ids); + PercolatorFeatureSetHelper::mergeMULTISEProteinIds(all_protein_ids, protein_ids); } if (all_protein_ids.empty()) @@ -240,19 +240,19 @@ class PSMFeatureExtractor : { if (getFlag_("concat")) { - TopPerc::addCONCATSEFeatures(all_peptide_ids, search_engines_used, feature_set); + PercolatorFeatureSetHelper::addCONCATSEFeatures(all_peptide_ids, search_engines_used, feature_set); } else { bool impute = getFlag_("impute"); bool limits = getFlag_("limit_imputation"); - TopPerc::addMULTISEFeatures(all_peptide_ids, search_engines_used, feature_set, !impute, limits); + PercolatorFeatureSetHelper::addMULTISEFeatures(all_peptide_ids, search_engines_used, feature_set, !impute, limits); } } - else if (search_engine == "MS-GF+") TopPerc::addMSGFFeatures(all_peptide_ids, feature_set); - else if (search_engine == "Mascot") TopPerc::addMASCOTFeatures(all_peptide_ids, feature_set); - else if (search_engine == "XTandem") 
TopPerc::addXTANDEMFeatures(all_peptide_ids, feature_set); - else if (search_engine == "Comet") TopPerc::addCOMETFeatures(all_peptide_ids, feature_set); + else if (search_engine == "MS-GF+") PercolatorFeatureSetHelper::addMSGFFeatures(all_peptide_ids, feature_set); + else if (search_engine == "Mascot") PercolatorFeatureSetHelper::addMASCOTFeatures(all_peptide_ids, feature_set); + else if (search_engine == "XTandem") PercolatorFeatureSetHelper::addXTANDEMFeatures(all_peptide_ids, feature_set); + else if (search_engine == "Comet") PercolatorFeatureSetHelper::addCOMETFeatures(all_peptide_ids, feature_set); else { writeLog_("No known input to create PSM features from. Aborting"); @@ -263,7 +263,7 @@ class PSMFeatureExtractor : for (vector::iterator it = all_peptide_ids.begin(); it != all_peptide_ids.end(); ++it) { it->setIdentifier(run_identifier); - TopPerc::checkExtraFeatures(it->getHits(), extra_features); // will remove inconsistently available features + PercolatorFeatureSetHelper::checkExtraFeatures(it->getHits(), extra_features); // will remove inconsistently available features } // TODO: There should only be 1 ProteinIdentification element in this vector, no need for a for loop From f39a28e07ddd250112b18cd999faea66bb4cb9e1 Mon Sep 17 00:00:00 2001 From: mwalzer Date: Mon, 27 Mar 2017 20:34:25 +0100 Subject: [PATCH 39/41] [NOP,FIX] corrected some style issues; replaced the other find with hasPrefix in getScanNumber_ of PercolatorAdapter --- .../ANALYSIS/ID/PercolatorFeatureSetHelper.cpp | 12 ++++++------ src/topp/PercolatorAdapter.cpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/PercolatorFeatureSetHelper.cpp b/src/openms/source/ANALYSIS/ID/PercolatorFeatureSetHelper.cpp index ddf998bf0aa..659b335c36b 100644 --- a/src/openms/source/ANALYSIS/ID/PercolatorFeatureSetHelper.cpp +++ b/src/openms/source/ANALYSIS/ID/PercolatorFeatureSetHelper.cpp @@ -198,14 +198,14 @@ namespace OpenMS double ln_num_sp; if 
(hit->metaValueExists("num_matched_peptides")) - { - double num_sp = hit->getMetaValue("num_matched_peptides").toString().toDouble(); + { + double num_sp = hit->getMetaValue("num_matched_peptides").toString().toDouble(); ln_num_sp = log(max(1.0, num_sp)); // if recorded, one can be safely assumed - } + } else // fallback - { - ln_num_sp = hit->getMetaValue("MS:1002255").toString().toDouble(); - } + { + ln_num_sp = hit->getMetaValue("MS:1002255").toString().toDouble(); + } double ln_rank_sp = log(max(1.0, hit->getMetaValue("MS:1002256").toString().toDouble())); hit->setMetaValue("COMET:lnNumSP", ln_num_sp); hit->setMetaValue("COMET:lnRankSP", ln_rank_sp); diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp index e13c6f08191..dab65716a11 100644 --- a/src/topp/PercolatorAdapter.cpp +++ b/src/topp/PercolatorAdapter.cpp @@ -135,7 +135,7 @@ class PercolatorAdapter : { } - PercolatorResult(StringList& row): + explicit PercolatorResult(StringList& row): proteinIds() { // peptide sequence @@ -279,7 +279,7 @@ class PercolatorAdapter : { // if scan number is not available, use the scan index Size idx = 0; - if ((idx = it->find("scan=")) != string::npos) + if ((idx = it->hasPrefix("scan=")) != string::npos) { scan_number = it->substr(idx + 5).toInt(); break; From 83194048c1b5514b3b24132750a1fd35ed41b220 Mon Sep 17 00:00:00 2001 From: Leon Bichmann Date: Mon, 27 Mar 2017 21:47:45 +0200 Subject: [PATCH 40/41] Julianus changes and pin file option --- src/openms/include/OpenMS/FORMAT/FileTypes.h | 1 + .../source/APPLICATIONS/ToolHandler.cpp | 1 + src/openms/source/FORMAT/FileTypes.cpp | 1 + src/utils/COMETAdapter.cpp | 26 ++++++------------- 4 files changed, 11 insertions(+), 18 deletions(-) diff --git a/src/openms/include/OpenMS/FORMAT/FileTypes.h b/src/openms/include/OpenMS/FORMAT/FileTypes.h index b162f8e55e5..d8d515ae61b 100644 --- a/src/openms/include/OpenMS/FORMAT/FileTypes.h +++ b/src/openms/include/OpenMS/FORMAT/FileTypes.h @@ -101,6 +101,7 @@ 
namespace OpenMS PSQ, ///< NCBI binary blast db MRM, ///< SpectraST MRM List PSMS, ///< Percolator tab-delimited output (PSM level) + PIN, ///< Percolator tab-delimited input (PSM level) SIZE_OF_TYPE ///< No file type. Simply stores the number of types }; diff --git a/src/openms/source/APPLICATIONS/ToolHandler.cpp b/src/openms/source/APPLICATIONS/ToolHandler.cpp index 2b7a3eea400..3162e6023a7 100755 --- a/src/openms/source/APPLICATIONS/ToolHandler.cpp +++ b/src/openms/source/APPLICATIONS/ToolHandler.cpp @@ -56,6 +56,7 @@ namespace OpenMS tools_map["ConsensusMapNormalizer"] = Internal::ToolDescription("ConsensusMapNormalizer", "Map Alignment"); tools_map["ConvertTraMLToTSV"] = Internal::ToolDescription("ConvertTraMLToTSV", "Targeted Experiments"); tools_map["ConvertTSVToTraML"] = Internal::ToolDescription("ConvertTSVToTraML", "Targeted Experiments"); + tools_map["COMETAdapter"] = Internal::ToolDescription("COMETAdapter", "Identification"); tools_map["Decharger"] = Internal::ToolDescription("Decharger", "Quantitation"); tools_map["DTAExtractor"] = Internal::ToolDescription("DTAExtractor", "File Handling"); tools_map["EICExtractor"] = Internal::ToolDescription("EICExtractor", "Quantitation"); diff --git a/src/openms/source/FORMAT/FileTypes.cpp b/src/openms/source/FORMAT/FileTypes.cpp index 22b2775c5b1..39289184667 100644 --- a/src/openms/source/FORMAT/FileTypes.cpp +++ b/src/openms/source/FORMAT/FileTypes.cpp @@ -130,6 +130,7 @@ namespace OpenMS targetMap[FileTypes::PSQ] = "psq"; targetMap[FileTypes::MRM] = "mrm"; targetMap[FileTypes::PSMS] = "psms"; + targetMap[FileTypes::PIN] = "pin"; return targetMap; } diff --git a/src/utils/COMETAdapter.cpp b/src/utils/COMETAdapter.cpp index 015bdc730ad..193c930b298 100755 --- a/src/utils/COMETAdapter.cpp +++ b/src/utils/COMETAdapter.cpp @@ -123,16 +123,14 @@ class TOPPCOMETAdapter : setValidFormats_("in", ListUtils::create("mzML")); registerOutputFile_("out", "", "", "Output file"); setValidFormats_("out", 
ListUtils::create("idXML")); + registerOutputFile_("pin_out", "", "", "Output file - for percolator input"); + setValidFormats_("pin_out", ListUtils::create("pin"), false); registerInputFile_("database", "", "", "FASTA file or pro file. Non-existing relative file-names are looked up via'OpenMS.ini:id_db_dir'", true, false, ListUtils::create("skipexists")); setValidFormats_("database", ListUtils::create("FASTA")); registerInputFile_("comet_executable", "", // choose the default value according to the platform where it will be executed // COMET compiles as tandem on OSX and tandem.exe on any other platform -#if defined(__APPLE__) - "comet.exe", -#else - "comet.exe", -#endif + "comet.exe", "Comet executable of the installation e.g. 'comet.exe'", true, false, ListUtils::create("skipexists")); addEmptyLine_(); @@ -141,7 +139,6 @@ class TOPPCOMETAdapter : // registerDoubleOption_("peptide_mass_tolerance", "", 10.0, "peptide_mass_tolerance", false); - registerDoubleOption_("precursor_mass_tolerance", "", 10.0, "Precursor mass tolerance", false); registerDoubleOption_("fragment_mass_tolerance", "", 0.3, "Fragment mass error", false); registerIntOption_("peptide_mass_units", "", 2, "0=amu, 1=mmu, 2=ppm", false); @@ -159,16 +156,9 @@ class TOPPCOMETAdapter : registerStringOption_("decoy_prefix", "", "rev_", "decoy entries are denoted by this string which is pre-pended to each protein accession", false); - //setValidStrings_("precursor_error_units", valid_strings); - //setValidStrings_("fragment_error_units", valid_strings); - - //registerIntOption_("min_precursor_charge", "", 2, "Minimum precursor charge", false); - //registerIntOption_("max_precursor_charge", "", 4, "Maximum precursor charge", false); - //registerStringList_("fixed_modifications", "", ListUtils::create(""), "Fixed modifications, specified using UniMod (www.unimod.org) terms, e.g. 
'Carbamidomethyl (C)' or 'Oxidation (M)'", false); vector all_mods; ModificationsDB::getInstance()->getAllSearchModifications(all_mods); - //setValidStrings_("fixed_modifications", all_mods); - registerStringList_("variable_modifications", "", ListUtils::create(""), "Variable modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'", false); + registerStringList_("variable_modifications", "", ListUtils::create(""), "Variable modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'", false); setValidStrings_("variable_modifications", all_mods); addEmptyLine_(); @@ -247,7 +237,7 @@ class TOPPCOMETAdapter : } else { - throw OpenMS::Exception::IllegalArgument(__FILE__, __LINE__, __FUNCTION__, "Error: Enzyme not supported. " + enzyme_name); + throw OpenMS::Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Error: Enzyme not supported. " + enzyme_name); } os << "search_enzyme_number = " << enzyme_number << "\n"; // choose from list at end of this params file @@ -261,7 +251,7 @@ class TOPPCOMETAdapter : vector variable_modifications = getModifications_(variable_modifications_names); if (variable_modifications.size() > 9) { - throw OpenMS::Exception::IllegalArgument(__FILE__, __LINE__, __FUNCTION__, "Error: Comet only supports 9 variable modifications. " + String(variable_modifications.size()) + " provided."); + throw OpenMS::Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Error: Comet only supports 9 variable modifications. 
" + String(variable_modifications.size()) + " provided."); } Size var_mod_index = 1; @@ -330,7 +320,8 @@ class TOPPCOMETAdapter : os << "output_sqtfile = " << 0 << "\n"; // 0=no, 1=yes write sqt file os << "output_txtfile = " << 0 << "\n"; // 0=no, 1=yes write tab-delimited txt file os << "output_pepxmlfile = " << 1 << "\n"; // 0=no, 1=yes write pep.xml file - os << "output_percolatorfile = " << 1 << "\n"; // 0=no, 1=yes write Percolator tab-delimited input file + + os << "output_percolatorfile = " << !getStringOption_("pin_out").empty() << "\n"; // 0=no, 1=yes write Percolator tab-delimited input file os << "output_outfiles = " << 0 << "\n"; // 0=no, 1=yes write .out files os << "print_expect_score = " << 1 << "\n"; // 0=no, 1=yes to replace Sp with expect in out & sqt os << "num_output_lines = " << 5 << "\n"; // num peptide results to show @@ -532,7 +523,6 @@ class TOPPCOMETAdapter : if (this->debug_level_ == 0) { File::remove(tmp_pepxml); - File::remove(tmp_pin); LOG_WARN << "Set debug level to >0 to keep the temporary pep.xml and pin files at '" << tmp_pepxml << "'" << std::endl; } else From 5f1ebf5977eb131c68698dfe342cef1759226176 Mon Sep 17 00:00:00 2001 From: Leon Bichmann Date: Tue, 28 Mar 2017 12:32:23 +0200 Subject: [PATCH 41/41] PercolatorAdapter has been changed by Timo reverted to proper parsing of native idstring. 
Prefix was not right for parsing + a couple of couts for debugging --- .DS_Store | Bin 0 -> 6148 bytes src/.DS_Store | Bin 0 -> 6148 bytes src/openms/.DS_Store | Bin 0 -> 6148 bytes src/topp/PercolatorAdapter.cpp | 7 ++++--- 4 files changed, 4 insertions(+), 3 deletions(-) create mode 100644 .DS_Store create mode 100644 src/.DS_Store create mode 100644 src/openms/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9e54da52323fd7059fcdfeff422512f4d4181b45 GIT binary patch literal 6148 zcmeHK%TB{U3>=dpRqCb39`_gcgH=_&pg&MVt8%CsDFx2C@!O0apr#2|4q!|6WWAnU zWe#x+K-O>j2VejkH4fbg9{JtIV?{UEk zp3vcNv(s>=o2Y-~DJ@R8+|*CsoY_<$6-WhAfmGmsD!`ttHeWbqOa)SbRNz|y{XP`B zVh!va?bE?vBLGo<;B4M`mc^RH9QQMhtN}^CjwPVDIR3NDLnmH%m+? zVz=}BV&#zPm@ySd1%?VVzFcbmf298~{|`yIN(EAZKc#@oHrvgbuN1v?^m5v33w=lb qGS*r-gSBF!wPG%`6|b)9iav9{2KJ6dXWr<<{1H%H(o%unP~a1Emm#14 literal 0 HcmV?d00001 diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..4bd2efc1187a9b7ac28443b3169053b67e6c42b6 GIT binary patch literal 6148 zcmeHK+e*Vg5Iv)@6h-KxkNXS$K`8YL`T<(7)Iv>4WArtj?wOrwN!mR5AR;rcb9U#J z?1AiN0LZ+3eFSCzrW}f*lMzw#sO!puCq&6L-qCz657n-&HY0`pqDkLl)8Gp$eBh0G$JqKT@`k=O8tOhzTBD#%-(r7}(bBVJwet)n zg#-h^Krj#t1Oq={K<^#O^v*DJFc1s`11ARbd`KLMrDJE9j}EHb0uT$Dvv97xgytm2 z(y=q7ha#3rv{dmHLoA*4ze|wS0%S{$K}KjLO35fKdBrGE&UnqdmK=joj^ZbCpFFwb z4W;<$j8{)bX+e!rKnk2Ha2eB?_5XYNFZch`BF&_L6!=#Pl*wYbnDdpgw{|aQy|&UH r=s(6>Pv_vRnDkc6iM8U}qr7U*TyM!S=;e%iIjJ83(?uo){z8Fo@-`fY literal 0 HcmV?d00001 diff --git a/src/topp/PercolatorAdapter.cpp b/src/topp/PercolatorAdapter.cpp index dab65716a11..c0fd9f487fd 100644 --- a/src/topp/PercolatorAdapter.cpp +++ b/src/topp/PercolatorAdapter.cpp @@ -279,12 +279,12 @@ class PercolatorAdapter : { // if scan number is not available, use the scan index Size idx = 0; - if ((idx = it->hasPrefix("scan=")) != string::npos) + if ((idx = it->find("scan=")) != string::npos) { scan_number = it->substr(idx + 
5).toInt(); break; } - else if ((idx = it->hasPrefix("index=")) != string::npos) + else if ((idx = it->find("index=")) != string::npos) { scan_number = it->substr(idx + 6).toInt(); } @@ -371,6 +371,7 @@ class PercolatorAdapter : //id label scannr calcmass expmass feature1 ... featureN peptide proteinId1 .. proteinIdM void preparePin_(vector& peptide_ids, StringList& feature_set, std::string& enz, TextFile& txt, int min_charge, int max_charge) { + std::cout << "number of features = " << feature_set.size() << "\n"; for (vector::iterator it = peptide_ids.begin(); it != peptide_ids.end(); ++it) { String scan_identifier = getScanIdentifier_(it, peptide_ids.begin()); @@ -774,7 +775,7 @@ class PercolatorAdapter : String pout_target_file_proteins(temp_directory_body + txt_designator + "_target_pout_proteins.tab"); String pout_decoy_file_proteins(temp_directory_body + txt_designator + "_decoy_pout_proteins.tab"); - LOG_DEBUG << "Writing percolator input file." << endl; + LOG_DEBUG << "Writing percolator input file." << pin_file << endl; TextFile txt; txt.addLine(ListUtils::concatenate(feature_set, '\t')); preparePin_(all_peptide_ids, feature_set, enz_str, txt, min_charge, max_charge);