diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
new file mode 100644
index 00000000..99162419
--- /dev/null
+++ b/.claude/CLAUDE.md
@@ -0,0 +1,306 @@
# Claude Code Universal Behavior Guidelines

## Overview

This document defines universal behavior guidelines for Claude Code across all commands and workflows. These principles apply regardless of the specific command being executed.

## Core Principles

### 1. Complete Documentation
- Document every action you take in the appropriate JSON file
- Track all files created, modified, or deleted
- Capture task progress and status changes
- Include all relevant context, decisions, and assumptions
- Never assume information is obvious - document everything explicitly

### 2. Consistent Output Format
- Always use the unified JSON schema (see below)
- Include all required fields for the relevant status
- Use optional fields as needed to provide additional context
- Validate JSON structure before completing work
- Ensure JSON is properly formatted and parseable

### 3. Session Management & Stateful Resumption
- Claude Code provides a session ID that maintains conversation context automatically
- Always include `session_id` in your output to enable seamless continuation
- When resuming work from a previous session, include `parent_session_id` to link sessions
- The session ID allows Claude Code to preserve full conversation history
- If you need user input, the context is preserved via session ID
- Include enough detail in `session_summary` to understand what was accomplished
- Don't make the user repeat information - session maintains context

### 4. Task Management
- Track all tasks in JSON output files (NOT in separate markdown files)
- Use hierarchical task IDs: "1.0" for parent, "1.1", "1.2" for children
- Track task status: pending, in_progress, completed, skipped, blocked
- Include task descriptions and any relevant notes
- Update task status as you work
- Document which tasks were completed in each session
- Note any tasks that were skipped and explain why
- When blocked, document the blocker clearly

### 5. Query Management
- Save all queries to users in the session JSON file
- When querying users, include:
  - Clear, specific questions
  - Query type (text, multiple_choice, boolean)
  - Any relevant context needed to answer
  - Query number for reference
- Save user responses in the same JSON file
- Link queries and responses with query numbers

## File Organization Structure

All agent-related documents and files must be organized under the `agent-io` directory:

```
agent-io/
├── prds/
│   └── <prd-name>/
│       ├── humanprompt.md    # Original user description of PRD
│       ├── fullprompt.md     # Fully fleshed PRD after completion
│       └── data.json         # JSON file documenting queries, responses, tasks, etc.
└── docs/
    └── <doc-name>.md         # Architecture docs, usage docs, etc.
```

### File Organization Guidelines:
- **PRD Files**: Save to `agent-io/prds/<prd-name>/` directory
  - Each PRD gets its own directory named after the PRD
  - Use kebab-case for PRD names (e.g., "user-profile-editing", "payment-integration")
  - Directory contains: humanprompt.md, fullprompt.md, and data.json
  - The data.json file tracks all queries, responses, tasks, errors, and progress

- **PRD Storage and Reference**:
  - **When user provides a prompt without a PRD name**:
    - Analyze the prompt to create a descriptive PRD name (use kebab-case)
    - Create directory: `agent-io/prds/<prd-name>/`
    - Save the original user prompt to `agent-io/prds/<prd-name>/humanprompt.md`
    - Document the PRD name in your output for future reference
    - This allows users to reference this PRD by name in future sessions

  - **When user references an existing PRD by name**:
    - Look for the PRD directory: `agent-io/prds/<prd-name>/`
    - Read available PRD files in order of preference:
      1. `fullprompt.md` - the complete, finalized PRD (if available)
      2. `humanprompt.md` - the original user description
    - Use these files as context for the requested work
    - Update or create additional files as needed

  - **PRD Naming Best Practices**:
    - Use descriptive, feature-focused names
    - Keep names concise (2-4 words typically)
    - Use kebab-case consistently
    - Examples: "user-authentication", "payment-processing", "real-time-notifications"

- **Documentation Files**: Save to `agent-io/docs/`
  - Architecture documentation: `agent-io/docs/<project-name>-architecture.md`
  - Usage documentation: `agent-io/docs/<project-name>-usage.md`
  - Other documentation as appropriate

- **Code Files**: Save to appropriate project locations
  - Follow existing project structure
  - Document each file in the JSON tracking file
  - Include purpose and type for each file

### JSON Documentation Files:
- Every PRD must have an associated `data.json` file in its directory
- The data.json file documents:
  - Tasks and their status
  - Queries to users and their responses
  - Errors and problems encountered
  - Files created, modified, deleted
  - Session information and summaries
  - Comments and context

## Unified JSON Output Schema

Use this schema for all JSON output files:

```json
{
  "command_type": "string (create-prd | doc-code-for-dev | doc-code-usage | free-agent | generate-tasks)",
  "status": "string (complete | incomplete | user_query | error)",
  "session_id": "string - Claude Code session ID for this execution",
  "parent_session_id": "string | null - Session ID of previous session when resuming work",
  "session_summary": "string - Brief summary of what was accomplished",

  "tasks": [
    {
      "task_id": "string (e.g., '1.0', '1.1', '2.0')",
      "description": "string",
      "status": "string (pending | in_progress | completed | skipped | blocked)",
      "parent_task_id": "string | null",
      "notes": "string (optional details about completion/issues)"
    }
  ],

  "files": {
    "created": [
      {
        "path": "string (relative to working directory)",
        "purpose": "string (why this file was created)",
        "type": "string (markdown | code | config | documentation)"
      }
    ],
    "modified": [
      {
        "path": "string",
        "changes": "string (description of modifications)"
      }
    ],
    "deleted": [
      {
        "path": "string",
        "reason": "string"
      }
    ]
  },

  "artifacts": {
    "prd_filename": "string (for create-prd command)",
    "documentation_filename": "string (for doc-code commands)"
  },

  "queries_for_user": [
    {
      "query_number": "integer",
      "query": "string",
"type": "string (text | multiple_choice | boolean)", + "choices": [ + { + "id": "string", + "value": "string" + } + ], + "response": "string | null - User's response (populated after query is answered)" + } + ], + + "comments": [ + "string - important notes, warnings, observations" + ], + + "context": "string - optional supplementary state details. Session ID preserves full context automatically, so this field is only needed for additional implementation-specific state not captured in the conversation.", + + "metrics": { + "duration_seconds": "number (optional)", + "files_analyzed": "number (optional)", + "lines_of_code": "number (optional)" + }, + + "errors": [ + { + "message": "string", + "type": "string", + "fatal": "boolean" + } + ] +} +``` + +## Required Fields by Status + +### Status: "complete" +- `command_type`, `status`, `session_id`, `session_summary`, `files`, `comments` +- `parent_session_id` (if this session continues work from a previous session) +- Plus any command-specific artifacts (prd_filename, documentation_filename, etc.) +- `tasks` array if the command involves tasks + +### Status: "user_query" +- `command_type`, `status`, `session_id`, `session_summary`, `queries_for_user` +- `files` (for work done so far) +- `comments` (explaining why input is needed) +- `context` (optional - session_id maintains context automatically) +- Note: When user provides answers, they'll create a new session with `parent_session_id` linking back to this one + +### Status: "incomplete" +- `command_type`, `status`, `session_id`, `session_summary`, `files`, `comments` +- Explanation in `comments` of what's incomplete and why +- `errors` array if errors caused incompleteness +- `context` (optional - session_id maintains context automatically) + +### Status: "error" +- `command_type`, `status`, `session_id`, `session_summary`, `errors`, `comments` +- `files` (if any work was done before error) +- `context` (optional - for additional recovery details beyond what session maintains) + +## Error Handling + +When errors occur: +1. Set status to "error" (or "incomplete" if partial work succeeded) +2. Document the error in the `errors` array +3. Include what failed, why it failed, and potential fixes +4. Document any work that was completed before the error +5. Provide context for potential recovery +6. 
Save error details to the JSON file

## Code Development Guidelines

### Keep Code Simple
- Prefer simple, straightforward implementations over clever or complex solutions
- Write code that is easy to read and understand
- Avoid unnecessary abstractions or over-engineering
- Use clear, descriptive variable and function names
- Comment complex logic, but prefer self-documenting code

### Limit Complexity
- Minimize the number of classes and Python files
- Consolidate related functionality into fewer, well-organized modules
- Only create new files when there's a clear separation of concerns
- Avoid deep inheritance hierarchies
- Prefer composition over inheritance when appropriate

### Use JSON Schema Validation
- All JSON files must have corresponding JSON schemas
- Validate JSON files against their schemas
- Document the schema in comments or separate schema files
- Use schema validation to catch errors early
- Keep schemas simple and focused

### Keep Code Management Simple
- Don't use excessive linting rules
- Avoid complex documentation frameworks (like Sphinx) unless truly needed
- Use simple, standard tools (pytest for testing, basic linting)
- Focus on clear code over extensive tooling
- Documentation should be clear markdown files, not generated sites

## Best Practices

- **Be Specific**: Include file paths, line numbers, function names
- **Be Complete**: Don't leave out details assuming the user knows them
- **Be Clear**: Write for someone who wasn't watching you work
- **Be Actionable**: Comments should help the user understand next steps
- **Be Honest**: If something is incomplete or uncertain, say so
- **Be Consistent**: Follow the same patterns and conventions throughout
- **Be Thorough**: Test your work and verify it functions correctly
- **Be Organized**: Maintain clean directory structure and file organization

## Workflow Principles

### PRD Workflow
1. User provides initial feature description → saved as `humanprompt.md`
2. Complete PRD after workflow → saved as `fullprompt.md`
3. All progress tracked in `data.json`

### Task Workflow
1. Break work into clear, manageable tasks
2. Use hierarchical task IDs (1.0, 1.1, 1.2, 2.0, etc.)
3. Update task status as work progresses
4. Document completed work and any blockers
5. Track everything in JSON file

### Documentation Workflow
1. Understand the codebase or feature thoroughly
2. Create clear, well-organized documentation
3. Save to appropriate location in `agent-io/docs/`
4. Track file creation and content in JSON output
5. Include examples and practical guidance

### Query Workflow
1. Only query when genuinely needed
2. Ask clear, specific questions
3. Save query to JSON file with query_number
4. Wait for user response
5. Save response to same JSON file
6. Continue work with provided information

diff --git a/.claude/commands/analyze-email.md b/.claude/commands/analyze-email.md
new file mode 100644
index 00000000..967e101a
--- /dev/null
+++ b/.claude/commands/analyze-email.md
@@ -0,0 +1,282 @@
# Command: analyze-email

## Purpose

Analyze an email document to extract key information, classify its importance, assign it to relevant projects, identify action items, and prepare a draft response. All analysis results are saved to a structured JSON file for downstream processing.
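As a minimal, hypothetical sketch of what that downstream processing might look like: the `orchestrator/email-analysis/` directory and the field names follow the schema defined later in this command, but the consumer script itself is not part of the command and is shown only for illustration:

```python
import json
from pathlib import Path

# Directory where analyze-email saves its numbered analysis files
analysis_dir = Path("orchestrator/email-analysis")

# The sequential filename prefix (0001-, 0002-, ...) makes a lexical sort chronological
analyses = sorted(analysis_dir.glob("*.json"))
if analyses:
    with analyses[-1].open() as f:
        analysis = json.load(f)

    # Route professional, actionable emails to task handling
    if analysis["classification"]["category"] == "professional":
        for task in analysis["tasks"]:
            print(task["task_id"], task["urgency"], task["description"])
```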
+ +## Command Type + +`analyze-email` + +## Input + +You will receive a request file containing: +- Email content (body, subject, sender, recipients) +- Email metadata (date, time, headers) +- User preferences (optional) + +## Process + +### Phase 1: Email Content Analysis + +1. **Read Email Document** + - Parse email subject, body, sender, recipients + - Extract metadata (date, time, CC, BCC if available) + - Identify attachments mentioned or referenced + - Note email thread context if provided + +2. **Extract Key Information** + - Identify main topics and themes + - Extract specific requests or questions + - Note mentioned dates, deadlines, or time-sensitive information + - Identify key stakeholders mentioned + - Extract any reference numbers, project codes, or identifiers + +### Phase 2: Classification + +3. **Classify Email Importance** + - Analyze content and metadata to classify as one of: + - **unimportant**: Mass emails, newsletters, low-priority updates, spam-like content + - **personal**: Personal correspondence, non-work related, social invitations + - **professional**: Work-related, business correspondence, project updates, actionable items + + - Consider these factors: + - Sender relationship (colleague, client, vendor, unknown) + - Subject urgency indicators (urgent, ASAP, deadline, etc.) + - Content type (FYI, action required, question, update) + - Presence of deadlines or action items + - Email thread importance + + - Provide classification confidence score (0.0-1.0) + - Document classification reasoning in comments + - Emails classified as unimportant should not proceed to further processing + +### Phase 3: Task Extraction + +4. **Identify Action Items** + - Scan email for explicit tasks: + - Action verbs (review, approve, send, create, update, etc.) + - Questions requiring responses + - Requests for information or deliverables + - Meeting requests or scheduling needs + + - For each identified task: + - Extract task description + - Determine task type (respond, review, create, schedule, research, etc.) + - Identify task owner (you, sender, other party) + - Extract related context and requirements + +5. **Determine Urgency and Deadlines** + - Analyze for urgency indicators: + - **Critical**: Explicit urgent markers, imminent deadlines (<24 hours), blocking issues + - **High**: Near-term deadlines (1-3 days), important stakeholders, escalations + - **Medium**: Standard deadlines (4-7 days), routine requests, normal priority + - **Low**: Long-term deadlines (>7 days), FYI items, optional tasks + + - Extract deadlines: + - Explicit dates ("by Friday", "before March 15") + - Implicit timeframes ("ASAP", "end of week", "Q1") + - Recurring deadlines ("weekly report", "monthly update") + + - Convert to standardized format (ISO 8601) + - If no deadline specified, suggest reasonable deadline based on urgency + +### Phase 4: Draft Response + +6. **Analyze Response Requirements** + - Determine if response is needed + - Identify key points to address + - Note any questions to answer + - Consider required tone (formal, casual, apologetic, etc.) + - Identify if response requires attachments or follow-up actions + +7. 
**Generate Draft Response** + - Create draft email response including: + - Appropriate greeting based on sender relationship + - Address all questions and requests + - Confirm understanding of tasks and deadlines + - Propose next steps if applicable + - Professional closing + + - Match tone to original email and relationship + - Keep response concise and actionable + - Include placeholders for information you don't have ([YOUR_INPUT_NEEDED]) + - Add suggested subject line (Re: or continuation) + + - If no response needed, set draft_response to null and explain why + +### Phase 5: Save Structured Output + +8. **Prepare JSON Output File** + - Determine sequence number for email analysis + - Check `orchestrator/email-analysis/` directory for existing analyses + - Use next sequential number (0001, 0002, 0003, etc.) + - If directory doesn't exist, create it and start at 0001 + +9. **Save Analysis File** + - Filename format: `orchestrator/email-analysis/[NNNN]-[YYYY-MM-DD]-[sender-name].json` + - Example: `orchestrator/email-analysis/0042-2025-11-09-john-smith.json` + - Use kebab-case for sender name + - Document the filename in JSON output's `artifacts.analysis_filename` + +## JSON Output Schema + +The analysis JSON file must follow this structure: + +```json +{ + "email_metadata": { + "subject": "string", + "sender": { + "name": "string", + "email": "string" + }, + "recipients": { + "to": ["email1@example.com", "email2@example.com"], + "cc": ["email3@example.com"], + "bcc": [] + }, + "date_received": "ISO 8601 datetime", + "thread_id": "string or null", + "message_id": "string or null", + "attachments": ["filename1.pdf", "filename2.xlsx"] + }, + + "classification": { + "category": "unimportant | personal | professional", + "confidence": 0.95, + "reasoning": "Detailed explanation of classification decision", + "urgency_level": "critical | high | medium | low", + "is_actionable": true, + "sentiment": "positive | neutral | negative | mixed" + }, + + "tasks": [ + { + "task_id": "T001", + "description": "Review and approve the Q4 budget proposal", + "task_type": "review | respond | create | schedule | research | approve | other", + "owner": "self | sender | other", + "urgency": "critical | high | medium | low", + "deadline": { + "date": "ISO 8601 datetime or null", + "is_explicit": true, + "original_text": "by end of week", + "suggested_deadline": "ISO 8601 datetime - if no explicit deadline" + }, + "status": "pending", + "context": "Additional context from email about this task", + "dependencies": ["T002"], + "estimated_effort": "15 minutes | 1 hour | 2 hours | 1 day | 1 week" + } + ], + + "draft_response": { + "should_respond": true, + "response_urgency": "immediate | today | this_week | no_rush", + "suggested_subject": "Re: Q4 Budget Review Request", + "draft_body": "Full draft email body with appropriate greeting, content, and closing", + "tone": "formal | professional | casual | friendly | apologetic", + "requires_attachments": false, + "placeholders": [ + { + "placeholder": "[YOUR_INPUT_NEEDED]", + "description": "Insert your availability for the meeting", + "location": "paragraph 2" + } + ], + "key_points_to_address": [ + "Confirm receipt of budget proposal", + "Provide timeline for review", + "Ask clarifying questions about line items" + ] + }, + + "summary": { + "one_line": "Budget approval request from Finance requiring review by Friday", + "detailed": "Longer summary (2-3 sentences) of email content and required actions", + "key_entities": [ + {"type": "person", "value": "Jane Doe"}, + 
{"type": "project", "value": "Q4 Budget Planning"}, + {"type": "document", "value": "Budget_Proposal_Q4.xlsx"}, + {"type": "date", "value": "2025-11-15"} + ] + }, + + "analysis_metadata": { + "analyzed_at": "ISO 8601 datetime", + "analysis_version": "1.0", + "model_used": "string", + "processing_time_seconds": 3.45, + "confidence_overall": 0.89, + "requires_human_review": false, + "review_reason": "string or null - why human review is needed" + } +} +``` + +## Command JSON Output Requirements + +Your command execution JSON output must include: + +**Required Fields:** +- `command_type`: "analyze-email" +- `status`: "complete", "user_query", or "error" +- `session_summary`: Brief summary of email analysis +- `files.created`: Array with the analysis JSON file entry +- `artifacts.analysis_filename`: Path to the analysis JSON file +- `artifacts.email_data`: Copy of the email_metadata for quick reference +- `comments`: Array of notes about the analysis process + +**For user_query status:** +- `queries_for_user`: Questions needing clarification +- `context`: Save partial analysis and email content + +**Example Comments:** +- "Email classified as professional with high confidence (0.95)" +- "Identified 3 action items with deadlines ranging from 2-5 days" +- "Draft response prepared; requires user input for meeting availability" +- "No explicit deadlines found; suggested deadlines based on urgency level" + +## Tasks to Track + +Create tasks in the internal todo list: + +``` +1.0 Parse and extract email content +2.0 Classify email importance and urgency +3.0 Extract tasks and deadlines +4.0 Generate draft response +5.0 Save structured JSON file +``` + +Mark tasks as completed as you progress. + +## Quality Checklist + +Before marking complete, verify: +- ✅ Email metadata completely extracted and validated +- ✅ Classification includes confidence score and reasoning +- ✅ All action items extracted with urgency and deadlines +- ✅ Deadlines converted to ISO 8601 format +- ✅ Draft response addresses all key points (if response needed) +- ✅ JSON file saved with correct naming and structure +- ✅ All required JSON schema fields populated +- ✅ Comments include insights about classification and task extraction +- ✅ Edge cases handled (no deadline, no clear tasks, etc.) + +## Error Handling + +Handle these scenarios gracefully: + +1. **Malformed Email**: Return error status with details +2. **No Clear Tasks**: Set tasks array to empty, note in comments +3. **Ambiguous Classification**: Use most likely category, lower confidence score +4. **No Response Needed**: Set draft_response.should_respond to false with explanation + +## Privacy and Security Considerations + +- Ensure sensitive information (passwords, SSNs, credentials) is not logged in comments +- Redact sensitive data in analysis file if present in email +- Document any sensitive content detected in analysis_metadata.requires_human_review +- Do not include full email body in command output JSON, only in analysis file diff --git a/.claude/commands/create-new-project.md b/.claude/commands/create-new-project.md new file mode 100644 index 00000000..b637c50c --- /dev/null +++ b/.claude/commands/create-new-project.md @@ -0,0 +1,637 @@ +# Command: create-new-project + +## Purpose + +Create a new project with complete setup including Cursor workspace configuration, virtual environment management, Claude commands installation, and optional git repository initialization. 
This command orchestrates multiple setup steps to create a fully configured development environment. + +## Command Type + +`create-new-project` + +## Input + +You will receive a request file containing: +- Project name (required) +- Project directory path (required - can be relative or absolute) +- Project type (optional: python, javascript, typescript, jupyter, multi-language) +- Initialize git repository (optional: boolean, default true) +- Python version for venv (optional: e.g., "3.11", default to system python3) +- Additional workspace folders (optional) +- Workspace settings preferences (optional) + +## Process + +### Phase 1: Create Project Directory + +1. **Setup Project Directory** + - Create project directory at specified path if it doesn't exist + - Convert to absolute path for consistency + - Verify write permissions + - Document the project path + +2. **Validate Project Name** + - Use provided project name or derive from directory name + - Sanitize for use in filenames (remove special characters) + - Check for conflicts with existing projects + - Document the final project name + +### Phase 2: Initialize Git Repository (Optional) + +3. **Git Initialization** + - If git initialization requested (default: true): + - Run: git init + - Create .gitignore file with common patterns + - Create initial commit with project structure + - Document git initialization status + - If git initialization skipped: + - Note in comments why it was skipped + - Continue with setup + +4. **Create .gitignore** + - Add common patterns based on project type: + - Python: venv/, __pycache__/, *.pyc, .pytest_cache/, *.egg-info/ + - JavaScript/Node: node_modules/, dist/, .cache/ + - Jupyter: .ipynb_checkpoints/, notebooks/datacache/ + - General: .DS_Store, .vscode/, *.swp, *.swo + - Claude: .claude/commands/ (managed by claude-commands) + - Keep: .claude/CLAUDE.md, .claude/settings.local.json + - Document .gitignore creation + +### Phase 3: Setup Virtual Environment with venvman + +5. **Register with venvman** + - Run: venvman add PROJECT_NAME PROJECT_PATH + - If Python version specified, create venv with that version + - If not specified, use system default python3 + - Document venvman registration + - Note the virtual environment path + +6. **Activate and Setup Python Environment** + - If project type is Python or Jupyter: + - Install basic dependencies (pip, setuptools, wheel) + - Create requirements.txt if it doesn't exist + - Document Python setup + - If not Python project: + - Note that venv was created but is optional + +### Phase 4: Install Claude Commands + +7. **Register with claude-commands** + - Run: claude-commands addproject PROJECT_PATH + - This installs SYSTEM-PROMPT.md to .claude/CLAUDE.md + - Installs all command files to .claude/commands/ + - Document claude-commands registration + - Count and list installed commands + +### Phase 5: Create Cursor Workspace + +8. **Generate Workspace File** + - Create workspace file: EXCLAMATION + project-name.code-workspace + - Include current directory as primary folder + - Configure settings based on project type + - Add file exclusions (venv/, node_modules/, __pycache__, etc.) + - Add search exclusions for performance + - Document workspace creation + +9. 
**Configure Project-Specific Settings** + - Python projects: Black formatter, pytest, type checking + - JavaScript/TypeScript: Prettier, ESLint + - Jupyter: Notebook settings, output limits + - Add extension recommendations + - Document all settings configured + +### Phase 6: Create Project Structure + +10. **Create Standard Directories** + - ALWAYS create: agent-io/ directory for Claude command tracking files + - Based on project type, create: + - Python: src/, tests/, docs/ + - Jupyter: notebooks/, notebooks/data/, notebooks/datacache/, notebooks/genomes/, notebooks/models/, notebooks/nboutput/, notebooks/util.py + - JavaScript: src/, tests/, dist/ + - General: docs/, README.md + - Document directory structure created + +11. **Create Initial Files** + - README.md with project name and description + - requirements.txt (for Python projects) + - package.json (for JavaScript projects) + - For Jupyter: notebooks/util.py with NotebookUtil template + - Document files created + +### Phase 7: Finalize Setup + +12. **Create Initial Git Commit (if git enabled)** + - Stage all created files + - Create commit: "Initial project setup: PROJECT_NAME" + - Include setup details in commit message + - Document commit creation + +13. **Generate Setup Summary** + - List all tools registered (venvman, claude-commands) + - List all files and directories created + - Provide next steps for user + - Document complete setup status + +### Phase 8: Save Structured Output + +14. **Save JSON Tracking File** + - IMPORTANT: Save all agent-io output to the NEW project directory, NOT the current working directory + - Create agent-io/ directory in the new project if it doesn't exist + - Save tracking JSON to: NEW_PROJECT_PATH/agent-io/create-new-project-session-SESSIONID.json + - Document all setup steps completed + - List all files and directories created + - Record all command executions + - Note any errors or warnings + - Include completion status + +## JSON Output Schema + +```json +{ + "command_type": "create-new-project", + "status": "complete | incomplete | user_query | error", + "session_id": "string", + "parent_session_id": "string | null", + "session_summary": "Brief summary of project creation", + + "project": { + "name": "string - project name", + "path": "string - absolute path to project", + "type": "python | javascript | typescript | jupyter | multi-language | other" + }, + + "git": { + "initialized": true, + "initial_commit": true, + "commit_hash": "string - git commit hash", + "gitignore_created": true + }, + + "venvman": { + "registered": true, + "command_run": "venvman add PROJECT_NAME PROJECT_PATH", + "venv_path": "string - path to virtual environment", + "python_version": "3.11" + }, + + "claude_commands": { + "registered": true, + "command_run": "claude-commands addproject .", + "commands_installed": 5, + "system_prompt_installed": true, + "commands_list": ["create-prd", "doc-code-for-dev", "doc-code-usage", "jupyter-dev", "cursor-setup"] + }, + + "workspace": { + "filename": "string - workspace file with ! 
prefix", + "path": "string - absolute path to workspace file", + "folders_count": 1, + "settings_configured": true, + "extensions_recommended": ["ms-python.python", "ms-toolsai.jupyter"] + }, + + "directories_created": [ + "agent-io/", + "src/", + "tests/", + "docs/", + "notebooks/", + "notebooks/data/", + "notebooks/datacache/", + "notebooks/genomes/", + "notebooks/models/", + "notebooks/nboutput/" + ], + + "files": { + "created": [ + { + "path": "!ProjectName.code-workspace", + "purpose": "Cursor workspace configuration", + "type": "config" + }, + { + "path": ".gitignore", + "purpose": "Git ignore patterns", + "type": "config" + }, + { + "path": "README.md", + "purpose": "Project documentation", + "type": "documentation" + }, + { + "path": "requirements.txt", + "purpose": "Python dependencies", + "type": "config" + }, + { + "path": ".claude/CLAUDE.md", + "purpose": "Claude system prompt", + "type": "documentation" + }, + { + "path": "agent-io/create-new-project-session-SESSIONID.json", + "purpose": "Claude command execution tracking for this session", + "type": "tracking" + } + ], + "modified": [] + }, + + "artifacts": { + "project_path": "absolute path to project", + "workspace_file": "path to workspace file", + "readme_file": "path to README.md", + "tracking_file": "agent-io/create-new-project-session-SESSIONID.json" + }, + + "next_steps": [ + "Open workspace: code !ProjectName.code-workspace", + "Activate venv: venvman activate ProjectName", + "Install dependencies: pip install -r requirements.txt", + "Start developing!" + ], + + "comments": [ + "Created project directory at /path/to/project", + "Created agent-io/ directory for Claude command tracking", + "Initialized git repository with initial commit", + "Registered with venvman using Python 3.11", + "Installed 5 Claude commands to .claude/commands/", + "Created Cursor workspace with Python settings", + "Created standard Python project structure (src/, tests/, docs/)", + "Generated README.md and requirements.txt", + "Saved tracking JSON to NEW_PROJECT_PATH/agent-io/" + ], + + "queries_for_user": [], + + "errors": [] +} +``` + +## Command JSON Output Requirements + +**Required Fields:** +- `command_type`: "create-new-project" +- `status`: "complete", "user_query", or "error" +- `session_id`: Session ID for this execution +- `session_summary`: Brief summary of project creation +- `project`: Project details (name, path, type) +- `git`: Git initialization status +- `venvman`: Virtual environment registration +- `claude_commands`: Claude commands registration +- `workspace`: Cursor workspace details +- `directories_created`: List of directories created +- `files`: All files created +- `artifacts`: Key file paths +- `next_steps`: User guidance for next actions +- `comments`: Detailed notes about setup process + +**For user_query status:** +- `queries_for_user`: Questions needing clarification +- `context`: Save partial setup state + +**Example Comments:** +- "Created new project 'MetabolicModeling' at ~/Projects/MetabolicModeling" +- "Initialized git repository with initial commit (abc123f)" +- "Registered with venvman using Python 3.11 at ~/Projects/MetabolicModeling/venv" +- "Installed 5 Claude commands to .claude/commands/" +- "Created Cursor workspace: !MetabolicModeling.code-workspace" +- "Created Jupyter notebook structure with util.py template" +- "Generated .gitignore with Python and Jupyter patterns" + +## .gitignore Template + +### Python Projects +``` +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +venv/ +env/ 
+ENV/ +.venv +pip-log.txt +pip-delete-this-directory.txt +.pytest_cache/ +.coverage +htmlcov/ +*.egg-info/ +dist/ +build/ + +# Jupyter +.ipynb_checkpoints/ +notebooks/datacache/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Claude (commands are managed by claude-commands) +.claude/commands/ + +# Agent-IO (Claude command tracking - keep in git for project history) +# agent-io/ is intentionally tracked + +# Keep these +!.claude/CLAUDE.md +!.claude/settings.local.json +``` + +### JavaScript/Node Projects +``` +# Node +node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* +dist/ +build/ +.cache/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Claude +.claude/commands/ + +# Agent-IO (keep in git) +# agent-io/ is intentionally tracked + +# Keep these +!.claude/CLAUDE.md +!.claude/settings.local.json +``` + +### Jupyter Projects +``` +# Jupyter +.ipynb_checkpoints/ +notebooks/datacache/ + +# Python +__pycache__/ +*.py[cod] +venv/ +*.egg-info/ + +# Data (keep structure, ignore large files) +notebooks/data/*.csv +notebooks/data/*.tsv +notebooks/data/*.xlsx +notebooks/genomes/*.fasta +notebooks/genomes/*.gbk +notebooks/models/*.xml +notebooks/models/*.json +notebooks/nboutput/* + +# Keep these data directory files +!notebooks/data/.gitkeep +!notebooks/genomes/.gitkeep +!notebooks/models/.gitkeep + +# IDE +.vscode/ +.DS_Store + +# Claude +.claude/commands/ + +# Agent-IO (keep in git) +# agent-io/ is intentionally tracked + +# Keep these +!.claude/CLAUDE.md +``` + +## README.md Template + +```markdown +# PROJECT_NAME + +[Brief project description] + +## Setup + +This project was created with the `create-new-project` Claude command. + +### Prerequisites + +- Python 3.11+ (or appropriate version) +- venvman for virtual environment management +- claude-commands for Claude Code integration + +### Installation + +1. Activate the virtual environment: + ```bash + venvman activate PROJECT_NAME + ``` + +2. 
Install dependencies: + ```bash + pip install -r requirements.txt + ``` + +### Development + +Open the Cursor workspace: +```bash +code !PROJECT_NAME.code-workspace +``` + +### Project Structure + +- `agent-io/` - Claude command execution tracking and session history +- `src/` - Source code +- `tests/` - Test files +- `docs/` - Documentation +- `notebooks/` - Jupyter notebooks (if applicable) +- `.claude/` - Claude Code configuration (commands managed by claude-commands) + +### Claude Code Integration + +This project includes Claude Code integration: +- Command tracking stored in `agent-io/` for project history +- Commands automatically installed to `.claude/commands/` (managed by claude-commands) +- Update commands: `claude-commands update` + +## License + +[Add license information] +``` + +## Jupyter util.py Template + +For Jupyter projects, create notebooks/util.py: + +```python +import sys +import os +import json +from os import path + +# Add the parent directory to the sys.path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +script_path = os.path.abspath(__file__) +script_dir = os.path.dirname(script_path) +base_dir = os.path.dirname(os.path.dirname(script_dir)) +folder_name = os.path.basename(script_dir) + +print(base_dir+"/KBUtilLib/src") +sys.path = [base_dir+"/KBUtilLib/src",base_dir+"/cobrakbase",base_dir+"/ModelSEEDpy/"] + sys.path + +# Import utilities with error handling +from kbutillib import NotebookUtils + +import hashlib +import pandas as pd +from modelseedpy import AnnotationOntology, MSPackageManager, MSMedia, MSModelUtil, MSBuilder, MSATPCorrection, MSGapfill, MSGrowthPhenotype, MSGrowthPhenotypes, ModelSEEDBiochem, MSExpression + +class NotebookUtil(NotebookUtils): + def __init__(self,**kwargs): + super().__init__( + notebook_folder=script_dir, + name="PROJECT_NAME", + user="chenry", + retries=5, + proxy_port=None, + **kwargs + ) + + # PLACE ALL UTILITY FUNCTIONS NEEDED FOR NOTEBOOKS HERE + +# Initialize the NotebookUtil instance +util = NotebookUtil() +``` + +## Quality Checklist + +Before marking complete, verify: +- ✅ Project directory created at specified path +- ✅ agent-io/ directory created in NEW project directory +- ✅ Git repository initialized (if requested) +- ✅ .gitignore created with appropriate patterns (agent-io/ kept in git) +- ✅ Initial git commit created (if git enabled) +- ✅ Registered with venvman successfully +- ✅ Virtual environment created with correct Python version +- ✅ Registered with claude-commands successfully +- ✅ Claude commands and SYSTEM-PROMPT installed to .claude/ +- ✅ Cursor workspace file created with exclamation prefix +- ✅ Workspace settings configured for project type +- ✅ Standard directory structure created +- ✅ README.md generated with project info +- ✅ requirements.txt or package.json created (if applicable) +- ✅ For Jupyter: notebooks/util.py created with project name +- ✅ All setup steps documented in comments +- ✅ Tracking JSON saved to NEW_PROJECT_PATH/agent-io/ directory +- ✅ Next steps provided for user + +## Error Handling + +Handle these scenarios gracefully: + +1. **Directory Already Exists**: Ask user whether to use existing or create new name +2. **Git Not Installed**: Skip git initialization, note in comments +3. **venvman Not Found**: Note error, continue with other setup steps +4. **claude-commands Not Found**: Note error, continue with other setup steps +5. **Permission Issues**: Document error and suggest manual fix +6. 
**Invalid Project Name**: Sanitize name and notify user of changes +7. **Python Version Not Available**: Fall back to system default, note in comments + +## Command Execution Order + +Critical: Execute commands in this exact order to avoid conflicts: + +1. Create project directory +2. Change to project directory +3. Create agent-io/ directory +4. Initialize git (optional) +5. Create .gitignore +6. Register with venvman +7. Register with claude-commands +8. Create workspace file +9. Create directory structure (including agent-io/) +10. Create initial files +11. Create initial git commit (if enabled) +12. Save tracking file to NEW_PROJECT_PATH/agent-io/ + +## Integration Notes + +### venvman Integration +- venvman stores virtual environments centrally +- Command: `venvman add PROJECT_NAME PROJECT_PATH` +- Activate with: `venvman activate PROJECT_NAME` +- List all: `venvman list` + +### claude-commands Integration +- Installs commands to .claude/commands/ +- Updates can be pulled with: `claude-commands update` +- List tracked projects: `claude-commands list` + +### Cursor Workspace +- Workspace file appears at top of directory (! prefix) +- Open with: `code !PROJECT_NAME.code-workspace` +- Settings are project-specific and version-controlled + +## Privacy and Security Considerations + +- Don't include API keys or credentials in generated files +- .gitignore should exclude sensitive data directories +- README template should not expose internal paths +- Virtual environment paths are local, not in git +- .claude/commands/ excluded from git (managed by claude-commands) +- Keep .claude/CLAUDE.md in git for project-specific settings + +## Next Steps After Project Creation + +Provide users with clear next steps: + +1. **Open Workspace** + ```bash + code !PROJECT_NAME.code-workspace + ``` + +2. **Activate Virtual Environment** + ```bash + venvman activate PROJECT_NAME + ``` + +3. **Install Dependencies** + ```bash + pip install -r requirements.txt + # or + npm install + ``` + +4. **Start Development** + - Begin coding in src/ + - Write tests in tests/ + - Document in docs/ + - For Jupyter: Create notebooks in notebooks/ + +5. **Commit Changes** + ```bash + git add . + git commit -m "Add initial implementation" + ``` diff --git a/.claude/commands/create-prd.md b/.claude/commands/create-prd.md new file mode 100644 index 00000000..e6794631 --- /dev/null +++ b/.claude/commands/create-prd.md @@ -0,0 +1,174 @@ +# Command: create-prd + +## Purpose + +Generate a comprehensive Product Requirements Document (PRD) from a user's feature request. The PRD should be clear, actionable, and suitable for a junior developer to understand and implement. + +## Command Type + +`create-prd` + +## Input + +You will receive a request file containing: +- Initial feature description or request +- Any existing context about the product/system +- Target users or stakeholders + +## Process + +### Phase 1: Clarification + +1. **Analyze the Request** + - Read the feature request carefully + - Identify what information is provided + - Identify what critical information is missing + +2. **Ask Clarifying Questions** (if needed) + - Ask about problem/goal: "What problem does this feature solve?" + - Ask about target users: "Who is the primary user?" + - Ask about core functionality: "What are the key actions users should perform?" + - Ask for user stories: "As a [user], I want to [action] so that [benefit]" + - Ask about acceptance criteria: "How will we know this is successfully implemented?" 
- Ask about scope: "What should this feature NOT do?"
   - Ask about data requirements: "What data needs to be displayed or manipulated?"
   - Ask about design/UI: "Are there mockups or UI guidelines?"
   - Ask about edge cases: "What potential error conditions should we consider?"

   **Important**: Only ask questions where the answer is not already clear from the request. Make reasonable assumptions and document them in comments.

### Phase 2: PRD Generation

3. **Generate PRD Markdown**
   - Create a comprehensive PRD following the structure below
   - Write for a junior developer audience
   - Be explicit and unambiguous
   - Avoid jargon where possible

4. **Determine PRD Directory Name**
   - Convert feature name to kebab-case
   - Example: "User Profile Editing" → "user-profile-editing"

5. **Save PRD Files**
   - Create directory: `agent-io/prds/<prd-name>/`
   - Save user's original request to: `agent-io/prds/<prd-name>/humanprompt.md`
   - Save complete PRD to: `agent-io/prds/<prd-name>/fullprompt.md`
   - Create JSON tracking file: `agent-io/prds/<prd-name>/data.json`
   - Document the filename in JSON output's `artifacts.prd_filename`

## PRD Structure

Your PRD markdown file must include these sections:

```markdown
# PRD: [Feature Name]

## Introduction/Overview
Brief description of the feature and the problem it solves. State the primary goal.

## Goals
List specific, measurable objectives for this feature:
1. [Goal 1]
2. [Goal 2]
3. [Goal 3]

## User Stories
Detail user narratives describing feature usage and benefits:

**As a** [type of user]
**I want to** [perform some action]
**So that** [I can achieve some benefit]

(Include 3-5 user stories)

## Functional Requirements

List specific functionalities the feature must have. Use clear, concise language. Number each requirement.

1. The system must [specific requirement]
2. The system must [specific requirement]
3. Users must be able to [specific action]
4. The feature must [specific behavior]

## Non-Goals (Out of Scope)

Clearly state what this feature will NOT include:
- [Non-goal 1]
- [Non-goal 2]
- [Non-goal 3]

## Design Considerations

(Optional - include if relevant)
- Link to mockups or design files
- Describe UI/UX requirements
- Mention relevant components or design system elements
- Note accessibility requirements

## Technical Considerations

(Optional - include if relevant)
- Known technical constraints
- Dependencies on other systems or modules
- Performance requirements
- Security considerations
- Scalability concerns

## Success Metrics

How will the success of this feature be measured?
- [Metric 1: e.g., "Increase user engagement by 10%"]
- [Metric 2: e.g., "Reduce support tickets related to X by 25%"]
- [Metric 3: e.g., "90% of users complete the flow without errors"]

## Open Questions

List any remaining questions or areas needing further clarification:
1. [Question 1]
2. [Question 2]
```

## Tasks to Track

Create tasks in the JSON output:

```
1.0 Clarify requirements (if questions needed)
2.0 Generate PRD content
3.0 Save PRD file
```

Mark tasks as completed as you progress.
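A minimal sketch of how that tracking might look in practice, assuming a hypothetical PRD named "user-profile-editing" and using the task fields from the unified JSON schema in CLAUDE.md:

```python
import json
from pathlib import Path

# Hypothetical PRD directory, following the agent-io layout
prd_dir = Path("agent-io/prds/user-profile-editing")
prd_dir.mkdir(parents=True, exist_ok=True)

tasks = [
    {"task_id": "1.0", "description": "Clarify requirements", "status": "pending",
     "parent_task_id": None, "notes": ""},
    {"task_id": "2.0", "description": "Generate PRD content", "status": "pending",
     "parent_task_id": None, "notes": ""},
    {"task_id": "3.0", "description": "Save PRD file", "status": "pending",
     "parent_task_id": None, "notes": ""},
]

def set_status(task_id: str, status: str, notes: str = "") -> None:
    """Mark a task as it moves through pending -> in_progress -> completed."""
    for task in tasks:
        if task["task_id"] == task_id:
            task["status"] = status
            if notes:
                task["notes"] = notes

set_status("1.0", "completed", "No questions needed; assumptions documented in comments")

# Persist the task list into the PRD's tracking file
with (prd_dir / "data.json").open("w") as f:
    json.dump({"tasks": tasks}, f, indent=2)
```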
+ +## JSON Output Requirements + +Your JSON output must include: + +**Required Fields:** +- `command_type`: "create-prd" +- `status`: "complete", "user_query", or "error" +- `session_summary`: Brief summary of PRD creation +- `files.created`: Array with the PRD file entry +- `artifacts.prd_filename`: Path to the PRD file +- `comments`: Array of notes (e.g., assumptions made, important decisions) + +**For user_query status:** +- `queries_for_user`: Your clarifying questions +- `context`: Save the initial request and any partial work + +**Example Comments:** +- "Assumed feature is for logged-in users only" +- "PRD written for web interface; mobile considerations noted as future enhancement" +- "No existing user authentication system mentioned; included as technical dependency" + +## Quality Checklist + +Before marking complete, verify: +- ✅ PRD includes all required sections +- ✅ Requirements are specific and measurable +- ✅ User stories follow the standard format +- ✅ Non-goals are clearly stated +- ✅ PRD is understandable by a junior developer +- ✅ File saved to correct location with correct naming +- ✅ JSON output includes all required fields +- ✅ All assumptions documented in comments diff --git a/.claude/commands/cursor-setup.md b/.claude/commands/cursor-setup.md new file mode 100644 index 00000000..526159f5 --- /dev/null +++ b/.claude/commands/cursor-setup.md @@ -0,0 +1,379 @@ +# Command: cursor-setup + +## Purpose + +Create a Cursor workspace file for the current project directory, enabling multi-root workspace features, custom settings, and organized project management in Cursor IDE. + +## Command Type + +`cursor-setup` + +## Input + +You will receive a request file containing: +- Project name (required) +- Additional workspace folders to include (optional) +- Workspace-specific settings (optional) +- Extensions to recommend (optional) + +## Process + +### Phase 1: Gather Project Information + +1. **Determine Project Name** + - Use project name from input request + - If not provided, derive from current directory name + - Sanitize name for filename use (remove special characters) + - Document the project name + +2. **Identify Project Structure** + - Examine current directory structure + - Identify key folders (src, tests, docs, etc.) + - Note any existing configuration files (.vscode, .cursor, etc.) + - Document project type (Python, Node.js, multi-language, etc.) + +### Phase 2: Create Workspace File + +3. **Generate Workspace Configuration** + - Create workspace file with naming pattern: EXCLAMATION-project-name.code-workspace + - The exclamation mark prefix ensures the file appears at top of directory listings + - Include current directory as primary folder + - Add any additional folders specified in request + - Configure workspace settings appropriate for project type + +4. **Configure Workspace Settings** + - Add workspace-level settings for: + - File associations + - Editor preferences + - Language-specific settings + - Search exclusions + - Extension recommendations + - Preserve any existing settings from .vscode/settings.json + - Document all settings added + +### Phase 3: Register with ClaudeCommands + +5. **Add Project to ClaudeCommands Database** + - Run the command: claude-commands addproject . 
+ - This registers the project directory in the ClaudeCommands tracking system + - Installs the latest Claude commands and SYSTEM-PROMPT.md to the project + - Document the registration in comments + - If the command fails, note the error but continue with workspace setup + +### Phase 4: Validate and Document + +6. **Validate Workspace File** + - Verify JSON structure is valid + - Ensure all paths are relative to workspace file location + - Check that workspace file can be opened in Cursor + - Document workspace structure + +7. **Create Documentation** + - Document workspace file location + - Explain workspace structure + - List any workspace-specific settings + - Provide usage instructions + +### Phase 5: Save Structured Output + +8. **Save JSON Tracking File** + - Document workspace file creation + - List all settings configured + - Note any issues or recommendations + - Include completion status + +## Workspace File Template + +The workspace file should follow this structure: + +```json +{ + "folders": [ + { + "path": ".", + "name": "" + } + ], + "settings": { + "files.exclude": { + "**/__pycache__": true, + "**/*.pyc": true, + "**/.pytest_cache": true, + "**/.DS_Store": true, + "**/node_modules": true, + "**/.git": false + }, + "search.exclude": { + "**/__pycache__": true, + "**/*.pyc": true, + "**/node_modules": true, + "**/.git": true + }, + "files.watcherExclude": { + "**/__pycache__/**": true, + "**/node_modules/**": true + } + }, + "extensions": { + "recommendations": [] + } +} +``` + +### Workspace Settings by Project Type + +**Python Projects:** +```json +{ + "python.analysis.typeCheckingMode": "basic", + "python.analysis.autoImportCompletions": true, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + "editor.codeActionsOnSave": { + "source.organizeImports": true + } + }, + "files.exclude": { + "**/__pycache__": true, + "**/*.pyc": true, + "**/.pytest_cache": true + } +} +``` + +**Node.js/JavaScript Projects:** +```json +{ + "[javascript]": { + "editor.defaultFormatter": "esbenp.prettier-vscode", + "editor.formatOnSave": true + }, + "[typescript]": { + "editor.defaultFormatter": "esbenp.prettier-vscode", + "editor.formatOnSave": true + }, + "files.exclude": { + "**/node_modules": true, + "**/dist": true, + "**/.cache": true + } +} +``` + +**Jupyter Notebook Projects:** +```json +{ + "jupyter.notebookFileRoot": "${workspaceFolder}/notebooks", + "notebook.output.textLineLimit": 500, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter" + }, + "files.exclude": { + "**/.ipynb_checkpoints": true, + "**/__pycache__": true + } +} +``` + +## JSON Output Schema + +```json +{ + "command_type": "cursor-setup", + "status": "complete | incomplete | user_query | error", + "session_id": "string", + "parent_session_id": "string | null", + "session_summary": "Brief summary of workspace setup", + + "project": { + "name": "string - project name", + "type": "python | javascript | typescript | jupyter | multi-language | other", + "workspace_filename": "string - filename with ! prefix" + }, + + "workspace": { + "folders": [ + { + "path": "string - relative path", + "name": "string - folder display name" + } + ], + "settings_count": 10, + "extensions_recommended": 3 + }, + + "claude_commands": { + "registered": true, + "command_run": "claude-commands addproject .", + "commands_installed": 5, + "system_prompt_installed": true + }, + + "files": { + "created": [ + { + "path": "string - workspace file with ! 
prefix", + "purpose": "Cursor workspace configuration", + "type": "config" + } + ], + "modified": [] + }, + + "artifacts": { + "workspace_filename": "string - workspace file with ! prefix", + "workspace_path": "absolute path to workspace file" + }, + + "comments": [ + "Created workspace file with name prefix '!' for top sorting", + "Configured Python-specific settings for project", + "Added file exclusions for __pycache__ and .pyc files", + "Workspace can be opened in Cursor via File > Open Workspace", + "Registered project with ClaudeCommands database", + "Installed 5 Claude commands to .claude/commands/" + ], + + "queries_for_user": [], + + "errors": [] +} +``` + +## Command JSON Output Requirements + +**Required Fields:** +- `command_type`: "cursor-setup" +- `status`: "complete", "user_query", or "error" +- `session_id`: Session ID for this execution +- `session_summary`: Brief summary of workspace creation +- `project`: Project name and workspace details +- `workspace`: Configuration details +- `claude_commands`: Registration status with ClaudeCommands database +- `files`: Workspace file created +- `artifacts`: Path to workspace file +- `comments`: Notes about workspace configuration + +**For user_query status:** +- `queries_for_user`: Questions about project structure or preferences +- `context`: Save partial workspace configuration + +**Example Comments:** +- "Created workspace file with exclamation prefix for top sorting" +- "Configured Python development settings with Black formatter" +- "Added exclusions for common Python cache directories" +- "Included notebooks/ folder as additional workspace folder" +- "Recommended extensions: Python, Jupyter, Black Formatter" + +## Workspace File Naming Convention + +The workspace file must be named with an exclamation mark prefix followed by the project name and .code-workspace extension. + +Format: EXCLAMATION + project-name + .code-workspace + +**Why the exclamation mark prefix?** +- Ensures workspace file appears at top of alphabetical directory listings +- Makes workspace file easy to find and identify +- Common convention for important configuration files +- Visual indicator of workspace root file + +**Examples:** +- Exclamation mark + MetabolicModeling.code-workspace +- Exclamation mark + ClaudeCommands.code-workspace +- Exclamation mark + WebsiteRedesign.code-workspace + +## Quality Checklist + +Before marking complete, verify: +- ✅ Workspace file created with exclamation mark prefix in filename +- ✅ JSON structure is valid and properly formatted +- ✅ Current directory included as primary folder +- ✅ Workspace settings appropriate for project type +- ✅ File exclusions configured to hide build artifacts +- ✅ Search exclusions configured for better performance +- ✅ Extension recommendations included (if applicable) +- ✅ All paths are relative to workspace file location +- ✅ Workspace file can be opened in Cursor +- ✅ Project registered with ClaudeCommands (claude-commands addproject .) +- ✅ Claude commands and SYSTEM-PROMPT installed to .claude/ directory +- ✅ Documentation includes usage instructions + +## Error Handling + +Handle these scenarios gracefully: + +1. **No Project Name**: Use current directory name as fallback +2. **Existing Workspace File**: Ask user whether to overwrite or merge +3. **Invalid Characters in Name**: Sanitize project name for filename +4. **Unknown Project Type**: Use generic workspace template +5. **Permission Issues**: Document if unable to write file +6. 
**ClaudeCommands Not Found**: Note error in comments, continue with workspace setup + +## Usage Instructions + +After creating workspace file, users can: + +1. **Open Workspace in Cursor** + - File > Open Workspace from File + - Select the workspace file (begins with exclamation mark) + - Or double-click the workspace file + +2. **Benefits of Workspace** + - Consistent settings across team members + - Multi-root folder support + - Workspace-specific extensions + - Organized project structure + - Easy project switching + +3. **Customization** + - Edit workspace file to add more folders + - Add custom tasks and launch configurations + - Configure language-specific settings + - Add extension recommendations + +## Advanced Workspace Features + +Optionally include these advanced features: + +**Tasks Configuration:** +```json +{ + "tasks": { + "version": "2.0.0", + "tasks": [ + { + "label": "Run Tests", + "type": "shell", + "command": "pytest", + "group": "test" + } + ] + } +} +``` + +**Launch Configurations:** +```json +{ + "launch": { + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}" + } + ] + } +} +``` + +## Privacy and Security Considerations + +- Don't include absolute paths that expose user directory structure +- Use relative paths for all folder references +- Don't include API keys or credentials in workspace settings +- Don't commit sensitive workspace settings to version control +- Use workspace file for team-shared settings only diff --git a/.claude/commands/doc-code-for-dev.md b/.claude/commands/doc-code-for-dev.md new file mode 100644 index 00000000..3b883034 --- /dev/null +++ b/.claude/commands/doc-code-for-dev.md @@ -0,0 +1,312 @@ +# Command: doc-code-for-dev + +## Purpose + +Create comprehensive architecture documentation that enables developers (and AI agents) to understand, modify, and extend a codebase. This is internal documentation about HOW the code works, not how to USE it. + +## Command Type + +`doc-code-for-dev` + +## Core Directive + +**YOUR ONLY JOB**: Document and explain the codebase as it exists today. + +**DO NOT:** +- Suggest improvements or changes +- Perform root cause analysis +- Propose future enhancements +- Critique the implementation +- Recommend refactoring or optimization +- Identify problems + +**ONLY:** +- Describe what exists +- Explain where components are located +- Show how systems work +- Document how components interact +- Map the technical architecture + +## Input + +You will receive a request file containing: +- Path to the codebase to document +- Optional: Specific areas to focus on +- Optional: Known entry points or key files + +## What to Document + +### 1. Project Structure +- Directory organization and purpose +- File naming conventions +- Module relationships and dependencies +- Configuration file locations + +### 2. Architectural Patterns +- Overall design patterns (MVC, microservices, etc.) +- Key abstractions and their purposes +- Separation of concerns +- Layering strategy + +### 3. Component Relationships +- How modules interact +- Data flow between components +- Dependency graphs +- Service boundaries + +### 4. Data Models +- Core data structures and classes +- Database schemas (if applicable) +- State management approach +- Data persistence strategy + +### 5. Key Algorithms and Logic +- Where business logic lives +- Complex algorithms and their purposes +- Decision points and control flow +- Critical code paths + +### 6. 
Extension Points +- Plugin systems or hooks +- Abstract classes meant to be extended +- Configuration-driven behavior +- Where to add new features + +### 7. Internal APIs +- Private/internal interfaces between modules +- Service contracts +- Communication protocols +- Message formats + +### 8. Development Setup +- Build system and tools +- Testing framework +- Development dependencies +- How to run locally + +## Research Process + +1. **Map the Structure** + - Generate directory tree + - Identify purpose of each major directory + - Locate configuration files + - Find entry points (main files, index files) + +2. **Identify Core Components** + - What are the main modules/packages? + - What is each component responsible for? + - What are key classes and functions? + - How are components named? + +3. **Trace Data Flow** + - Follow data from entry point to storage + - Identify transformations + - Map processing stages + - Document state changes + +4. **Understand Patterns** + - What design patterns are used? + - How is state managed? + - How are errors handled? + - What conventions are followed? + +5. **Find Extension Mechanisms** + - Where can new features be added? + - What patterns should be followed? + - What interfaces need implementation? + - How are plugins/extensions loaded? + +6. **Document Build/Test** + - How to set up development environment + - How to run tests + - How to build/compile + - What tools are required + +## Documentation Structure + +Create a markdown file with this structure: + +```markdown +# [Project Name] - Architecture Documentation + +## Overview +High-level description of system architecture and design philosophy. +Include: What this system does, key technologies, architectural approach. + +## Project Structure +``` +project/ +├── module1/ # Purpose: [description] +│ ├── submodule/ # Purpose: [description] +│ └── core.py # [description] +├── module2/ # Purpose: [description] +└── tests/ # Purpose: [description] +``` + +## Core Components + +### Component: [Name] +- **Location**: `path/to/component` +- **Purpose**: [What this component does] +- **Key Classes/Functions**: + - `ClassName`: [Description and role] + - `function_name()`: [Description and role] +- **Dependencies**: [What it depends on] +- **Used By**: [What depends on it] + +[Repeat for each major component] + +## Architecture Patterns + +### Pattern: [Name] +- **Where Used**: [Locations in codebase] +- **Purpose**: [Why this pattern is used] +- **Implementation**: [How it's implemented] +- **Key Classes**: [Classes involved] + +## Data Flow + +### Flow: [Name] +``` +Entry Point → Component A → Component B → Storage +``` +- **Description**: [Detailed explanation] +- **Transformations**: [What happens at each stage] +- **Error Handling**: [How errors are managed] + +## Data Models + +### Model: [Name] +- **Location**: `path/to/model` +- **Purpose**: [What this represents] +- **Key Fields**: + - `field_name` (type): [Description] +- **Relationships**: [Relations to other models] +- **Persistence**: [How/where stored] + +## Module Dependencies + +``` +module1 + ├─ depends on: module2, module3 + └─ used by: module4 + +module2 + ├─ depends on: module3 + └─ used by: module1, module5 +``` + +## Key Algorithms + +### Algorithm: [Name] +- **Location**: `path/to/file:line_number` +- **Purpose**: [What problem it solves] +- **Input**: [What it takes] +- **Output**: [What it produces] +- **Complexity**: [Time/space if relevant] +- **Critical Details**: [Important notes] + +## Extension Points + +### Extension 
Point: [Name] +- **How to Extend**: [Instructions] +- **Required Interface**: [What must be implemented] +- **Examples**: [Existing implementations] +- **Integration**: [How extensions are registered] + +## State Management +- **Where State Lives**: [Description] +- **State Lifecycle**: [Creation, modification, destruction] +- **Concurrency**: [How concurrent access handled] +- **Persistence**: [How state is saved/loaded] + +## Error Handling Strategy +- **Exception Hierarchy**: [Custom exceptions] +- **Error Propagation**: [How errors bubble up] +- **Recovery Mechanisms**: [How failures handled] +- **Logging**: [Where errors are logged] + +## Testing Architecture +- **Test Organization**: [How tests structured] +- **Test Types**: [Unit, integration, e2e] +- **Fixtures and Mocks**: [Common utilities] +- **Running Tests**: [Commands to run tests] + +## Development Setup + +### Prerequisites +- [Required tools and versions] +- [System dependencies] + +### Setup Steps +1. [Clone and install] +2. [Configuration] +3. [Database setup if applicable] +4. [Verification] + +### Build System +- [Build commands] +- [Artifacts produced] +- [Build configuration] + +## Important Conventions +- [Naming conventions] +- [Code organization patterns] +- [Documentation standards] + +## Critical Files +- `file.py`: [Why important] +- `config.yaml`: [Configuration structure] +- `schema.sql`: [Database schema] + +## Glossary +- **Term**: [Definition in context of this codebase] +``` + +## Output Files + +1. **Save Documentation** + - Filename: `agent-io/docs/[project-name]-architecture.md` + - Create `agent-io/docs/` directory if it doesn't exist + - Use kebab-case for project name + +2. **Reference in JSON** + - Add to `artifacts.documentation_filename` + - Add to `files.created` array + +## JSON Output Requirements + +**Required Fields:** +- `command_type`: "doc-code-for-dev" +- `status`: "complete", "user_query", or "error" +- `session_summary`: Brief summary of documentation created +- `files.created`: Array with the documentation file +- `artifacts.documentation_filename`: Path to documentation +- `comments`: Important observations and notes + +**Optional Fields:** +- `metrics.files_analyzed`: Number of files examined +- `metrics.lines_of_code`: Total LOC in codebase + +**Example Comments:** +- "Analyzed 147 files across 12 modules" +- "Identified MVC pattern throughout web layer" +- "Found plugin system using abstract base classes" +- "Database uses SQLAlchemy ORM with 23 models" +- "Note: Some circular dependencies between auth and user modules" + +## Quality Checklist + +Before marking complete, verify: +- ✅ Complete project structure mapped with purposes +- ✅ All major components documented with responsibilities +- ✅ Architectural patterns identified and explained +- ✅ Data flow through system clearly traced +- ✅ Module dependencies visualized +- ✅ Extension points identified with examples +- ✅ Development setup instructions provided +- ✅ Key algorithms documented with locations +- ✅ State management strategy explained +- ✅ A developer can start contributing in < 30 minutes +- ✅ Documentation is in markdown format +- ✅ No suggestions for improvements (only documentation) diff --git a/.claude/commands/doc-code-usage.md b/.claude/commands/doc-code-usage.md new file mode 100644 index 00000000..c2451aa4 --- /dev/null +++ b/.claude/commands/doc-code-usage.md @@ -0,0 +1,403 @@ +# Command: doc-code-usage + +## Purpose + +Create comprehensive usage documentation that shows developers how to USE a codebase as a 
library, tool, or API. This is external-facing documentation for consumers of the code, not for those modifying it. + +## Command Type + +`doc-code-usage` + +## Core Directive + +**YOUR ONLY JOB**: Document how to use the code as it exists today. + +**DO NOT:** +- Document internal implementation details +- Explain code architecture or design patterns +- Suggest improvements or changes +- Document private methods or internal APIs +- Explain how to modify or extend the codebase + +**ONLY:** +- Document public APIs +- Show how to install and import +- Provide usage examples +- Document command-line interfaces +- Explain configuration options +- Document input/output formats + +## Input + +You will receive a request file containing: +- Path to the codebase to document +- Optional: Type of interface (library, CLI, API) +- Optional: Target audience (beginner, advanced) + +## What to Document + +### 1. Public APIs +- All public classes, functions, and methods +- Function signatures with parameter types +- Return types and values +- Exceptions that may be raised +- Usage examples for each major API + +### 2. Command-Line Interfaces +- All CLI commands and subcommands +- Flags, options, and arguments +- Input/output formats +- Usage examples +- Common workflows + +### 3. Configuration +- Configuration files and formats +- Environment variables +- Default values +- Required vs optional settings +- Configuration examples + +### 4. Entry Points +- Installation instructions +- Import statements +- Main entry points for different use cases +- Quick start guide +- First-run setup + +### 5. Data Formats +- Input data structures and schemas +- Output data structures and schemas +- File formats (if applicable) +- Data validation rules +- Example data + +### 6. Error Handling +- Common errors users might encounter +- Error messages and their meanings +- Exception types that may be raised +- How to handle errors +- Troubleshooting guide + +## Research Process + +1. **Identify Entry Points** + - Scan for main() functions + - Look for CLI definitions + - Find package exports + - Check setup.py, package.json, etc. + +2. **Map Public APIs** + - Find all public-facing modules + - Identify public classes and functions + - Distinguish public from private/internal + - Check for docstrings and type hints + +3. **Extract Signatures** + - Document all parameters with types + - Document return values + - Note any decorators + - Capture default values + +4. **Find Examples** + - Look in README files + - Check documentation folders + - Examine test files for usage patterns + - Find example directories + - Check docstrings for examples + +5. **Document Configuration** + - Find config files + - Identify environment variables + - Document all options + - Note defaults and requirements + +## Documentation Structure + +Create a markdown file with this structure: + +```markdown +# [Project Name] - Usage Documentation + +## Overview +Brief description of what this code does and who should use it. +Include: Purpose, key features, target users. 
## Installation

### Requirements
- [Language/runtime version]
- [Required dependencies]
- [System requirements]

### Install via [Package Manager]
```bash
[installation command]
```

### Install from Source
```bash
[clone and install commands]
```

## Quick Start

[Minimal example to get started - 5-10 lines]

```[language]
# Simple example that demonstrates basic usage
```

## API Reference

### Module: [module_name]

#### Class: [ClassName]

Brief description of what this class does.

**Constructor**
```[language]
ClassName(param1: type, param2: type = default)
```

**Parameters:**
- `param1` (type): Description
- `param2` (type, optional): Description. Defaults to `default`.

**Example:**
```[language]
# Example usage
```

#### Method: [method_name]

Brief description of what this method does.

```[language]
method_name(param1: type, param2: type) -> return_type
```

**Parameters:**
- `param1` (type): Description
- `param2` (type): Description

**Returns:**
- `return_type`: Description of return value

**Raises:**
- `ExceptionType`: When this exception is raised

**Example:**
```[language]
# Example usage
```

### Function: [function_name]

Brief description of what this function does.

```[language]
function_name(param1: type, param2: type = default) -> return_type
```

**Parameters:**
- `param1` (type): Description
- `param2` (type, optional): Description. Defaults to `default`.

**Returns:**
- `return_type`: Description

**Example:**
```[language]
# Example usage
```

## Command-Line Interface

(Include this section if the code has a CLI)

### Command: [command_name]

Brief description of what this command does.

**Usage:**
```bash
command_name [options] <arguments>
```

**Options:**
- `-f, --flag`: Description
- `-o, --option <value>`: Description

**Arguments:**
- `<arg>`: Description (required)
- `[arg]`: Description (optional)

**Examples:**
```bash
# Example 1: Basic usage
command_name file.txt

# Example 2: With options
command_name --flag --option value file.txt
```

## Configuration

### Configuration File

[Project Name] can be configured using `config.[ext]`:

```[format]
# Example configuration
option1: value1
option2: value2
```

**Options:**
- `option1`: Description. Default: `default1`
- `option2`: Description. Default: `default2`

### Environment Variables

- `ENV_VAR_NAME`: Description. Default: `default`
- `ANOTHER_VAR`: Description. Required if [condition]

## Data Formats

### Input Format

Description of expected input format.

**Example:**
```[format]
{
  "field1": "value1",
  "field2": "value2"
}
```

### Output Format

Description of output format.

**Example:**
```[format]
{
  "result": "value",
  "status": "success"
}
```

## Error Reference

### Common Errors

**Error: [Error Message]**
- **Cause**: Why this error occurs
- **Solution**: How to fix it

**Exception: [ExceptionType]**
- **When**: When this exception is raised
- **Handling**: How to catch and handle it
- **Example**:
```[language]
try:
    # code that might raise exception
except ExceptionType as e:
    # handle error
```

## Examples

### Example 1: [Use Case Name]

Description of this use case.

```[language]
# Complete working example
```

### Example 2: [Use Case Name]

Description of this use case.
```[language]
# Complete working example
```

## Advanced Usage

(Optional section for complex features)

### [Advanced Feature Name]

Description and examples of advanced usage.

## Troubleshooting

**Problem**: [Common problem]
**Solution**: [How to solve it]

**Problem**: [Another problem]
**Solution**: [How to solve it]

## API Stability

(If relevant)
- Note which APIs are stable vs experimental
- Deprecation warnings
- Version compatibility

## Further Resources

- Documentation: [link]
- Examples: [link]
- Community: [link]
```

## Output Files

1. **Save Documentation**
   - Filename: `agent-io/docs/[project-name]-usage.md`
   - Create `agent-io/docs/` directory if it doesn't exist
   - Use kebab-case for project name

2. **Reference in JSON**
   - Add to `artifacts.documentation_filename`
   - Add to `files.created` array

## JSON Output Requirements

**Required Fields:**
- `command_type`: "doc-code-usage"
- `status`: "complete", "user_query", or "error"
- `session_summary`: Brief summary of documentation created
- `files.created`: Array with the documentation file
- `artifacts.documentation_filename`: Path to documentation
- `comments`: Important observations and notes

**Optional Fields:**
- `metrics.files_analyzed`: Number of files examined
- `metrics.apis_documented`: Number of public APIs documented

**Example Comments:**
- "Documented 47 public functions across 8 modules"
- "Found comprehensive CLI with 12 commands"
- "Note: Some functions have minimal docstrings - documented based on code analysis"
- "Configuration supports both .yaml and .json formats"
- "Library supports Python 3.8+"

## Quality Checklist

Before marking complete, verify:
- ✅ All public APIs documented with signatures and examples
- ✅ All CLI commands documented with usage examples
- ✅ Configuration options clearly explained
- ✅ Quick start guide enables first use in < 5 minutes
- ✅ Error reference covers common issues
- ✅ Documentation is organized and easy to navigate
- ✅ No internal/private implementation details leaked
- ✅ Examples are practical and copy-pasteable
- ✅ Installation instructions are clear
- ✅ Parameter types and return types documented

diff --git a/.claude/commands/free-agent.md b/.claude/commands/free-agent.md new file mode 100644 index 00000000..785eae2d --- /dev/null +++ b/.claude/commands/free-agent.md @@ -0,0 +1,425 @@

# Command: free-agent

## Purpose

Execute simple, well-defined tasks from natural language requests. This is for straightforward operations like file management, git operations, system tasks, data processing, and other common development activities.

## Command Type

`free-agent`

## Core Directive

You are a task execution agent that interprets natural language requests and carries them out efficiently. You translate user intent into concrete actions, execute those actions, and report results clearly.

**YOUR JOB:**
- ✅ Understand the natural language request
- ✅ Execute the requested task completely
- ✅ Report what you did clearly and concisely
- ✅ Ask for clarification only when genuinely ambiguous
- ✅ Handle errors gracefully
- ✅ Work independently without unnecessary back-and-forth

**DO NOT:**
- ❌ Over-think simple requests
- ❌ Ask for permission to do what was explicitly requested
- ❌ Provide lengthy explanations unless something went wrong
- ❌ Suggest alternatives unless the requested approach fails
- ❌ Perform complex analysis (use specialized commands for that)

## Input

You will receive a request file containing:
- A natural language description of what to do
- Any relevant context or constraints
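For instance, the entire body of a request file might be a single informal instruction like the following (a hypothetical example); it is conversational but unambiguous, so it should be executed without any clarifying query:

```
Convert all the CSV files in data/ to JSON and put the results in output/.
If a file can't be parsed, skip it and note why in your comments.
```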
## Scope

### Ideal Use Cases
- **Git operations**: Clone repos, checkout branches, commit, push/pull
- **File operations**: Create, move, copy, delete, organize files/directories
- **Data processing**: Convert formats, parse data, generate reports
- **System tasks**: Run scripts, install packages, set up environments
- **Text processing**: Search/replace, format conversion, data extraction
- **Simple automation**: Batch operations, routine tasks

### Out of Scope
- Complex software development (use specialized commands)
- Comprehensive code research/documentation (use doc-code commands)
- Multi-day projects requiring extensive planning
- Tasks requiring deep domain expertise

## Execution Process

### 1. Interpret the Request
- Parse the natural language to understand intent
- Identify specific action(s) required
- Determine if all necessary information is present

### 2. Check for Ambiguity

**Only ask for clarification if:**
- Request is genuinely ambiguous (e.g., "clone the repo" - which repo?)
- Critical information is missing (e.g., "checkout branch" - which branch?)
- Multiple reasonable interpretations exist

**Do NOT ask if:**
- Request is clear even if informal
- You can reasonably infer the intent
- Request is specific enough to execute

### 3. Execute the Task
- Perform the requested operations
- Handle errors appropriately
- Validate results when possible
- Track actions for reporting

### 4. Document Everything
- Track all files created, modified, deleted
- Note all commands executed
- Capture any errors or warnings
- Prepare clear summary

## Common Task Patterns

### Git Operations
```bash
# Clone repository
git clone [url] [directory]

# Checkout branch
git checkout [branch]

# Commit changes
git add [files]
git commit -m "[message]"

# Push/pull
git push origin [branch]
git pull origin [branch]
```

**Documentation:**
- Note repository URL and target directory
- Document branch names
- Include commit messages
- Track any conflicts or issues

### File Operations
```bash
# Create directories
mkdir -p [path]

# Copy files
cp -r [source] [destination]

# Move files
mv [source] [destination]

# Delete files
rm -rf [path]  # Use with caution!

# Organize files
# (custom logic based on request)
```

**Documentation:**
- List all files/directories affected
- Note source and destination paths
- Document any files that couldn't be processed
- Explain organization logic

### Data Processing
```python
# Convert CSV to JSON
import csv, json
# ... implementation

# Parse and transform data
# ... custom logic based on request

# Generate reports
# ... custom logic
```

**Documentation:**
- Input file(s) and format
- Output file(s) and format
- Number of records processed
- Any data validation issues
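As a concrete illustration of the CSV-to-JSON pattern sketched above, a task might be carried out roughly as follows (the `data/` and `output/` locations and the UTF-8 default are assumptions to adapt per request):

```python
import csv
import json
from pathlib import Path

def csv_to_json(src: Path, dest: Path, encoding: str = "utf-8") -> int:
    """Convert one CSV file into a JSON array of row objects; return the row count."""
    with src.open(newline="", encoding=encoding) as handle:
        rows = list(csv.DictReader(handle))
    dest.write_text(json.dumps(rows, indent=2))
    return len(rows)

# Mirror every CSV in data/ into output/, reporting per-file record counts
out_dir = Path("output")
out_dir.mkdir(exist_ok=True)
for src in sorted(Path("data").glob("*.csv")):
    count = csv_to_json(src, out_dir / f"{src.stem}.json")
    print(f"{src.name}: {count} records")
```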
### System Tasks
```bash
# Install packages
pip install [package]
npm install [package]

# Run scripts
python script.py
bash script.sh

# Set up environments
python -m venv venv
source venv/bin/activate
```

**Documentation:**
- Commands executed
- Packages/tools installed
- Any version information
- Success/failure status

## Error Handling

When errors occur:

1. **Set appropriate status**
   - "error" if nothing completed
   - "incomplete" if some work succeeded

2. **Document the error**
   - What failed
   - Why it failed (if known)
   - What impact it had

3. **Provide context**
   - What was attempted
   - What succeeded before the error
   - How to potentially fix or retry

## JSON Output Requirements

**Required Fields:**
- `command_type`: "free-agent"
- `status`: "complete", "incomplete", "user_query", or "error"
- `session_summary`: 1-3 sentence summary of what happened
- `files`: Document all file operations
- `comments`: Important notes, warnings, observations

**For complete status:**
```json
{
  "command_type": "free-agent",
  "status": "complete",
  "session_summary": "Successfully cloned CMD-schema repository and organized 23 files",
  "files": {
    "created": [...],
    "modified": [],
    "deleted": []
  },
  "comments": [
    "Cloned from: https://github.com/example/CMD-schema.git",
    "Repository contains 47 files, 2.3 MB",
    "Organized schema files into schemas/ directory"
  ]
}
```

**For user_query status:**
```json
{
  "command_type": "free-agent",
  "status": "user_query",
  "session_summary": "Need clarification on which repository to clone",
  "queries_for_user": [
    {
      "query_number": 1,
      "query": "Which repository would you like to clone? Please provide the repository URL or name.",
      "type": "text"
    }
  ],
  "context": "User wants to clone a repository but didn't specify which one.",
  "files": {
    "created": [],
    "modified": [],
    "deleted": []
  },
  "comments": []
}
```

**For incomplete status:**
```json
{
  "command_type": "free-agent",
  "status": "incomplete",
  "session_summary": "Processed 3 of 5 CSV files before encountering encoding error",
  "files": {
    "created": [
      {
        "path": "output/data1.json",
        "purpose": "Converted from data1.csv",
        "type": "data"
      },
      {
        "path": "output/data2.json",
        "purpose": "Converted from data2.csv",
        "type": "data"
      },
      {
        "path": "output/data3.json",
        "purpose": "Converted from data3.csv",
        "type": "data"
      }
    ],
    "modified": [],
    "deleted": []
  },
  "errors": [
    {
      "message": "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0",
      "type": "EncodingError",
      "fatal": false,
      "context": "Failed processing data4.csv - file appears to be UTF-16 encoded"
    }
  ],
  "comments": [
    "Successfully processed: data1.csv, data2.csv, data3.csv",
    "Failed on data4.csv: encoding error (file appears to be UTF-16)",
    "Not attempted: data5.csv"
  ],
  "context": "Need to handle UTF-16 encoding for remaining files.
Already processed: [data1.csv, data2.csv, data3.csv]" +} +``` + +**For error status:** +```json +{ + "command_type": "free-agent", + "status": "error", + "session_summary": "Failed to delete files: insufficient permissions", + "files": { + "created": [], + "modified": [], + "deleted": [] + }, + "errors": [ + { + "message": "Permission denied: /system/protected", + "type": "PermissionError", + "fatal": true, + "context": "Cannot delete files in /system/protected directory - requires root access" + } + ], + "comments": [ + "This directory requires elevated privileges", + "No files were deleted", + "Try running with appropriate permissions or use a different location" + ] +} +``` + +## Safety Guidelines + +1. **Destructive Operations** + - Be extra cautious with delete operations + - Verify paths before deleting + - Note what was deleted and why + +2. **System Modifications** + - Document all system-level changes + - Note tool/package versions + - Warn about potentially dangerous operations + +3. **Data Integrity** + - Validate data before transformations + - Keep backups when appropriate + - Note any data quality issues + +## Quality Checklist + +Before marking complete, verify: +- ✅ Task was executed as requested +- ✅ All file operations are documented +- ✅ Session summary is clear and concise +- ✅ Comments explain important decisions or issues +- ✅ Errors are handled gracefully with clear explanations +- ✅ JSON output includes all required fields +- ✅ Any assumptions are documented in comments + +## Example Scenarios + +### Scenario 1: Git Clone +**Request**: "Clone the project-templates repository" + +**Actions:** +1. Search for project-templates repository URL +2. Clone to current directory +3. Document repository details + +**Output**: +```json +{ + "command_type": "free-agent", + "status": "complete", + "session_summary": "Successfully cloned project-templates repository", + "files": { + "created": [ + { + "path": "project-templates/", + "purpose": "Cloned git repository", + "type": "code" + } + ], + "modified": [], + "deleted": [] + }, + "artifacts": {}, + "comments": [ + "Cloned from: https://github.com/example/project-templates.git", + "Repository size: 1.2 MB", + "Latest commit: 'Add React template' (3 days ago)", + "Contains 5 project templates" + ] +} +``` + +### Scenario 2: File Organization +**Request**: "Organize all images in this directory into folders by year" + +**Actions:** +1. Scan for image files +2. Read EXIF data or file timestamps +3. Create year folders +4. Move images +5. 
Report results + +**Output**: +```json +{ + "command_type": "free-agent", + "status": "complete", + "session_summary": "Organized 247 images into 4 year-based folders", + "files": { + "created": [ + { + "path": "2021/", + "purpose": "Images from 2021", + "type": "data" + }, + { + "path": "2022/", + "purpose": "Images from 2022", + "type": "data" + }, + { + "path": "2023/", + "purpose": "Images from 2023", + "type": "data" + }, + { + "path": "2024/", + "purpose": "Images from 2024", + "type": "data" + } + ], + "modified": [], + "deleted": [] + }, + "comments": [ + "Organized by year: 2021 (43 images), 2022 (89 images), 2023 (67 images), 2024 (48 images)", + "Used EXIF data where available, file modification time as fallback", + "3 files skipped: no valid date information (corrupted.jpg, temp.png, test.gif)" + ] +} +``` diff --git a/.claude/commands/generate-tasks.md b/.claude/commands/generate-tasks.md new file mode 100644 index 00000000..9d00bfd7 --- /dev/null +++ b/.claude/commands/generate-tasks.md @@ -0,0 +1,191 @@ +# Command: generate-tasks + +## Purpose + +Generate a detailed, hierarchical task list from an existing PRD. Tasks should guide a developer through implementation with clear, actionable steps. + +## Command Type + +`generate-tasks` + +## Input + +You will receive a request file containing: +- Reference to a specific PRD file (path or ID) +- Any additional context or constraints + +## Process + +### Phase 1: Analysis + +1. **Read the PRD** + - Locate and read the specified PRD file + - Understand functional requirements + - Identify user stories and acceptance criteria + - Note technical considerations + +2. **Assess Current Codebase** + - Review existing code structure + - Identify relevant existing components + - Understand architectural patterns + - Note relevant files that may need modification + - Identify utilities and libraries already in use + +3. **Identify Relevant Files** + - List files that will need to be created + - List files that will need to be modified + - Include corresponding test files + - Note the purpose of each file + +### Phase 2: Generate Parent Tasks + +4. **Create High-Level Tasks** + - Break the PRD into 4-7 major work streams + - Each parent task should be a significant milestone + - Examples: + - "Set up data models and database schema" + - "Implement backend API endpoints" + - "Create frontend components" + - "Add form validation and error handling" + - "Implement tests" + - "Add documentation" + +5. **Present to User** + - Generate the high-level tasks in the JSON output + - Set status to "user_query" + - Ask: "I have generated the high-level tasks. Ready to generate sub-tasks? Respond with 'Go' to proceed." + - Save context with the parent tasks + +### Phase 3: Generate Sub-Tasks + +6. **Wait for User Confirmation** + - Only proceed after user responds with "Go" or equivalent + +7. **Break Down Each Parent Task** + - Create 2-8 sub-tasks for each parent task + - Sub-tasks should be: + - Specific and actionable + - Able to be completed in 15-60 minutes + - Ordered logically (dependencies first) + - Clear enough for a junior developer + + **Sub-task Quality Guidelines:** + - Start with action verbs: "Create", "Implement", "Add", "Update", "Test" + - Include what and where: "Create UserProfile component in components/profile/" + - Reference existing patterns: "Following the pattern used in AuthForm component" + - Note dependencies: "After completing 1.2, update..." + +8. 
**Update Task List** + - Add all sub-tasks to the JSON output + - Link sub-tasks to parent tasks using parent_task_id + - All tasks should have status "pending" + +## Task ID Format + +- **Parent tasks**: X.0 (1.0, 2.0, 3.0, etc.) +- **Sub-tasks**: X.Y (1.1, 1.2, 1.3, etc.) +- Maximum depth: 2 levels (no sub-sub-tasks) + +## Task Structure in JSON + +```json +{ + "task_id": "1.0", + "description": "Set up data models and database schema", + "status": "pending", + "parent_task_id": null, + "notes": "" +}, +{ + "task_id": "1.1", + "description": "Create User model with fields: name, email, avatar_url, bio", + "status": "pending", + "parent_task_id": "1.0", + "notes": "Reference existing models in models/ directory" +} +``` + +## Relevant Files Documentation + +In your `comments` array, include a section listing relevant files: + +``` +"RELEVANT FILES:", +"- src/models/User.ts - Create new User model", +"- src/models/User.test.ts - Unit tests for User model", +"- src/api/users.ts - API endpoints for user operations", +"- src/api/users.test.ts - API endpoint tests", +"- src/components/UserProfile.tsx - New profile display component", +"- src/components/UserProfile.test.tsx - Component tests" +``` + +## JSON Output Requirements + +**Required Fields:** +- `command_type`: "generate-tasks" +- `status`: "complete" (after sub-tasks) or "user_query" (after parent tasks) +- `session_summary`: Brief summary of task generation +- `tasks`: Array of all tasks (parent and sub-tasks after completion) +- `comments`: Include relevant files list and important notes + +**For user_query status (after Phase 2):** +- `tasks`: Array with only parent tasks +- `queries_for_user`: Ask user to confirm before generating sub-tasks +- `context`: Save PRD analysis and parent tasks + +**Example Comments:** +- "Generated 5 parent tasks and 27 sub-tasks total" +- "Identified 12 files that need creation or modification" +- "Tasks assume use of existing authentication middleware" +- "Test tasks follow Jest/React Testing Library patterns used in codebase" + +## Quality Checklist + +Before marking complete, verify: +- ✅ All functional requirements from PRD are covered by tasks +- ✅ Tasks are ordered logically with dependencies first +- ✅ Each task is specific and actionable +- ✅ Parent tasks represent major milestones +- ✅ Sub-tasks can each be completed in reasonable time +- ✅ Testing tasks are included +- ✅ Task descriptions reference existing patterns where relevant +- ✅ All tasks use proper ID format +- ✅ Relevant files are identified with purposes +- ✅ JSON output includes all required fields + +## Example Task Breakdown + +**Parent Task:** +```json +{ + "task_id": "2.0", + "description": "Implement backend API endpoints", + "status": "pending", + "parent_task_id": null +} +``` + +**Sub-tasks:** +```json +{ + "task_id": "2.1", + "description": "Create GET /api/users/:id endpoint to retrieve user profile", + "status": "pending", + "parent_task_id": "2.0", + "notes": "Return user object with all fields from User model" +}, +{ + "task_id": "2.2", + "description": "Create PUT /api/users/:id endpoint to update user profile", + "status": "pending", + "parent_task_id": "2.0", + "notes": "Validate input, check authorization, update only allowed fields" +}, +{ + "task_id": "2.3", + "description": "Add authentication middleware to protect user endpoints", + "status": "pending", + "parent_task_id": "2.0", + "notes": "Use existing auth middleware pattern from api/auth.ts" +} +``` diff --git a/.claude/commands/jupyter-dev.md 
b/.claude/commands/jupyter-dev.md new file mode 100644 index 00000000..61eeb30b --- /dev/null +++ b/.claude/commands/jupyter-dev.md @@ -0,0 +1,480 @@

# Command: jupyter-dev

## Purpose

Develop Jupyter notebooks following a standardized workflow that emphasizes:
- Organized directory structure with data, models, and output segregation
- Independent, self-contained cells that can run in any order
- Centralized utilities and imports via util.py
- Intermediate data caching for debugging and efficiency
- Clear markdown documentation preceding each code cell

## Command Type

`jupyter-dev`

## Input

You will receive a request file containing:
- Notebook development task description
- Project name (for util.py configuration)
- Specific analysis or computation requirements
- Input data files (optional)
- User preferences (optional)

## Project Structure

All notebooks must follow this directory structure:

```
notebooks/
├── util.py                # Centralized utilities and imports
├── <notebook_name>.ipynb  # Notebook files
├── data/                  # Input data (experimental, omics, expression data)
├── datacache/             # JSON output from util.save() function
├── genomes/               # Genome files
├── models/                # COBRA/COBRApy models
└── nboutput/              # Non-JSON output (TSV, Excel, tables, etc.)
```

### Directory Purposes

- **notebooks/**: Root directory containing all notebooks and util.py
- **data/**: All input data files (experimental data, omics data, expression data)
- **datacache/**: Intermediate JSON data saved via util.save() for cell independence
- **genomes/**: Genome files only
- **models/**: COBRA/COBRApy model files only
- **nboutput/**: Non-JSON output files (TSV, Excel, tables, plots, etc.)

## util.py Structure

The util.py file must follow this template:

```python
import sys
import os
import json
from os import path

# Add the parent directory to the sys.path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
base_dir = os.path.dirname(os.path.dirname(script_dir))
folder_name = os.path.basename(script_dir)

print(base_dir+"/KBUtilLib/src")
sys.path = [base_dir+"/KBUtilLib/src",base_dir+"/cobrakbase",base_dir+"/ModelSEEDpy/"] + sys.path

# Import utilities with error handling
from kbutillib import NotebookUtils

import hashlib
import pandas as pd
from modelseedpy import AnnotationOntology, MSPackageManager, MSMedia, MSModelUtil, MSBuilder, MSATPCorrection, MSGapfill, MSGrowthPhenotype, MSGrowthPhenotypes, ModelSEEDBiochem, MSExpression

class NotebookUtil(NotebookUtils):
    def __init__(self,**kwargs):
        super().__init__(
            notebook_folder=script_dir,
            name="<project_name>",
            user="chenry",
            retries=5,
            proxy_port=None,
            **kwargs
        )

    # PLACE ALL UTILITY FUNCTIONS NEEDED FOR NOTEBOOKS HERE

# Initialize the NotebookUtil instance
util = NotebookUtil()
```

### Key Points for util.py

1. **Replace `<project_name>`** with the actual project name from user input
2. **Add all imports** needed by notebooks to this file
3. **Add all utility functions** as methods of the NotebookUtil class
4. **Keep it centralized**: All shared code goes here, not in notebook cells
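To make the utility-function rule concrete, a project-specific helper would be added as a method of the class in util.py, along these lines (the helper below is hypothetical; it reuses the `script_dir` variable and `pandas` import already defined in the template):

```python
class NotebookUtil(NotebookUtils):
    # ... __init__ exactly as in the template above ...

    def load_expression_table(self, filename):
        """Read a TSV expression table from the notebook data/ directory.

        Keeping I/O helpers here, rather than in cells, lets every cell
        share one implementation through the `util` instance.
        """
        filepath = os.path.join(script_dir, "data", filename)
        return pd.read_csv(filepath, sep="\t", index_col=0)
```

A cell would then call it as `table = util.load_expression_table("expression.tsv")`.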
## Notebook Cell Design Pattern

Every notebook must follow this strict cell pattern:

### 1. Markdown Cell (Always First)
```markdown
## [Step Name/Purpose]

[Explanation of what this code cell does and why]
- Key objective
- Input data used
- Output data produced
- Any important notes
```

### 2. Code Cell (Always Second)
```python
%run util.py

# Load required data from previous steps
data1 = util.load('data1_name')
data2 = util.load('data2_name')

# Perform analysis/computation
result = some_analysis(data1, data2)

# Save intermediate results for cell independence
util.save('result_name', result)
```

### Critical Cell Design Rules

1. **Every code cell starts with**: `%run util.py`
   - This instantiates the util class
   - This loads all imports
   - This ensures cell independence

2. **Load data at cell start**: Use `util.load('data_name')` for any data from previous cells
   - Only load what this cell needs
   - Data comes from datacache/ directory

3. **Save data at cell end**: Use `util.save('data_name', data)` for outputs
   - Save all intermediate results that other cells might need
   - Only JSON-serializable data structures
   - Saved to datacache/ directory

4. **Cell independence**: Each cell should run independently
   - Don't rely on variables from previous cells without loading them
   - Don't assume cells run in order
   - Enable debugging by re-running individual cells

5. **Markdown precedes code**: Every code cell has a markdown cell explaining it
   - What the cell does
   - Why it's needed
   - What data it uses and produces

## Process

### Phase 1: Setup Project Structure

1. **Check for notebooks/ Directory**
   - If `notebooks/` doesn't exist, create it
   - If it exists, verify subdirectories

2. **Create Required Subdirectories**
   - Create `notebooks/data/` if missing
   - Create `notebooks/datacache/` if missing
   - Create `notebooks/genomes/` if missing
   - Create `notebooks/models/` if missing
   - Create `notebooks/nboutput/` if missing

3. **Create or Validate util.py**
   - If `notebooks/util.py` doesn't exist, create it from template
   - Replace `<project_name>` with actual project name
   - If util.py exists, verify it has the NotebookUtil class
   - Document whether created or validated

### Phase 2: Understand Requirements

4. **Analyze Task Description**
   - Identify the scientific/analytical goal
   - Determine required input data
   - Identify computation steps needed
   - Plan logical cell breakdown
   - Determine what utility functions might be needed

5. **Plan Notebook Structure**
   - Break task into logical steps (cells)
   - Identify data flow between cells
   - Determine what gets saved/loaded at each step
   - Plan utility functions for util.py
   - Document the planned structure

### Phase 3: Develop Utility Functions

6. **Add Utility Functions to util.py**
   - Add any custom functions needed by notebooks
   - Add imports required for these functions
   - Add functions as methods to NotebookUtil class
   - Document each function with docstrings
   - Keep functions general and reusable

### Phase 4: Create/Modify Notebook

7. **Create Notebook Cells**
   - For each logical step:
     - Create markdown cell explaining the step
     - Create code cell with proper pattern:
       - Start with `%run util.py`
       - Load required data with util.load()
       - Perform computation
       - Save results with util.save()
   - Follow cell independence principles
   - Add clear variable names and comments

8. **Organize Data Files**
   - Move/reference input data to `notebooks/data/`
   - Reference genome files from `notebooks/genomes/`
   - Reference model files from `notebooks/models/`
   - Save non-JSON output to `notebooks/nboutput/`
   - Let util.save() handle datacache/ automatically
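The exact behavior of `util.save()` and `util.load()` comes from the `NotebookUtils` base class, but conceptually they amount to JSON checkpointing in `datacache/`, roughly like this sketch (assumed behavior, not the real implementation):

```python
import json
import os

DATACACHE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "datacache")

def save(name, data):
    """Checkpoint a JSON-serializable object to datacache/<name>.json."""
    os.makedirs(DATACACHE, exist_ok=True)
    with open(os.path.join(DATACACHE, name + ".json"), "w") as handle:
        json.dump(data, handle)

def load(name):
    """Reload a checkpoint so any cell can run without its predecessors."""
    with open(os.path.join(DATACACHE, name + ".json")) as handle:
        return json.load(handle)
```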
### Phase 5: Validate and Document

9. **Verify Notebook Standards**
   - Every code cell starts with `%run util.py`
   - Every code cell has preceding markdown explanation
   - Data dependencies use util.load()
   - Results saved with util.save()
   - Cells can run independently
   - All files in correct directories

10. **Create Summary Documentation**
    - Document notebook purpose and workflow
    - List required input data and locations
    - Describe each major step
    - Note any manual setup required
    - Include example usage

### Phase 6: Save Structured Output

11. **Save JSON Tracking File**
    - Document all files created/modified
    - List all utility functions added
    - Describe notebook cell structure
    - Note any issues or edge cases
    - Include completion status

## JSON Output Schema

The command execution tracking file must follow this structure:

```json
{
  "command_type": "jupyter-dev",
  "status": "complete | incomplete | user_query | error",
  "session_id": "string",
  "parent_session_id": "string | null",
  "session_summary": "Brief summary of notebook development work",

  "project": {
    "name": "string - project name used in util.py",
    "notebook_name": "string - name of notebook file",
    "purpose": "string - what this notebook does"
  },

  "structure": {
    "directories_created": ["data", "datacache", "genomes", "models", "nboutput"],
    "util_py_status": "created | existed | modified",
    "notebook_path": "notebooks/<notebook_name>.ipynb"
  },

  "notebook_cells": [
    {
      "cell_number": 1,
      "type": "markdown | code",
      "purpose": "Description of what this cell does",
      "data_loaded": ["data1", "data2"],
      "data_saved": ["result1"]
    }
  ],

  "utility_functions": [
    {
      "name": "function_name",
      "purpose": "What this utility function does",
      "added_to_util_py": true
    }
  ],

  "files": {
    "created": [
      {
        "path": "notebooks/util.py",
        "purpose": "Centralized utilities and imports",
        "type": "code"
      }
    ],
    "modified": [
      {
        "path": "notebooks/analysis.ipynb",
        "changes": "Added 5 cells for data loading and analysis"
      }
    ],
    "data_files": [
      {
        "path": "notebooks/data/experimental_data.csv",
        "purpose": "Input experimental data",
        "type": "input"
      }
    ]
  },

  "artifacts": {
    "notebook_filename": "notebooks/<notebook_name>.ipynb",
    "util_py_path": "notebooks/util.py",
    "cell_count": 10,
    "utility_function_count": 3
  },

  "validation": {
    "all_cells_have_markdown": true,
    "all_cells_start_with_run_util": true,
    "data_loading_uses_util_load": true,
    "data_saving_uses_util_save": true,
    "cells_independent": true,
    "files_in_correct_directories": true
  },

  "comments": [
    "Created notebook structure with 5 analysis steps",
    "Added 3 utility functions for data processing",
    "All cells follow independence pattern with util.load/save",
    "Input data placed in notebooks/data/",
    "Output tables saved to notebooks/nboutput/"
  ],

  "queries_for_user": [],

  "errors": []
}
```

## Command JSON Output Requirements

Your command execution JSON output must include:

**Required Fields:**
- `command_type`: "jupyter-dev"
- `status`: "complete", "user_query", or "error"
- `session_id`: Session ID for this execution
- `session_summary`: Brief summary of notebook development
- `project`: Project name and notebook details
- `structure`: Directory and util.py status
- `files`: All files created, modified, or referenced
- `artifacts`: Paths to notebook and util.py
- `validation`: Checklist confirming standards followed
- `comments`: Notes about development process

**For user_query
status:** +- `queries_for_user`: Questions needing clarification +- `context`: Save partial work and notebook state + +**Example Comments:** +- "Created notebooks directory structure with all required subdirectories" +- "Generated util.py with project name 'MetabolicAnalysis'" +- "Created notebook with 8 cells following independence pattern" +- "Added 4 utility functions for COBRA model manipulation" +- "All intermediate results saved to datacache/ for cell independence" +- "Placed genome files in genomes/, model files in models/" + +## Design Principles + +### Cell Independence Philosophy + +The notebook design prioritizes **cell independence** for several critical reasons: + +1. **Debugging Efficiency**: Re-run individual cells without executing entire notebook +2. **Time Savings**: Skip expensive computations by loading cached results +3. **Error Recovery**: Recover from failures without losing all progress +4. **Experimentation**: Test variations by modifying single cells +5. **Collaboration**: Others can understand and modify individual steps + +### Implementation Strategy + +- **util.load()** and **util.save()** create checkpoints +- **datacache/** stores intermediate results as JSON +- **%run util.py** ensures consistent environment +- **Markdown cells** provide context for each step + +### When to Save Data + +Save data when: +- Results took significant time to compute +- Data will be used by multiple subsequent cells +- Intermediate results are worth preserving +- Enabling cell re-runs would save time + +Don't save data when: +- Quick computations (< 1 second) +- Data only used in next cell +- Data is not JSON-serializable (save to nboutput/ instead) + +## Utility Function Guidelines + +Add functions to util.py when: +- Code is used by multiple cells +- Complex operations that need documentation +- Interactions with external systems (APIs, databases) +- Data transformations used repeatedly +- Model-specific operations + +Keep in notebooks when: +- Code is cell-specific analysis +- One-time exploratory code +- Visualization/plotting specific to that cell +- Simple operations that don't need abstraction + +## Quality Checklist + +Before marking complete, verify: +- ✅ notebooks/ directory exists with all 5 subdirectories +- ✅ util.py exists and has correct project name +- ✅ util.py contains NotebookUtil class with needed functions +- ✅ Every code cell starts with `%run util.py` +- ✅ Every code cell has preceding markdown explanation +- ✅ Data dependencies use util.load() +- ✅ Results saved with util.save() where appropriate +- ✅ Cells can run independently (tested) +- ✅ Input data in data/ directory +- ✅ Models in models/ directory +- ✅ Genomes in genomes/ directory +- ✅ Non-JSON output in nboutput/ directory +- ✅ JSON output handled by util.save() to datacache/ +- ✅ Markdown cells explain reasoning and purpose +- ✅ All imports in util.py, not scattered in cells +- ✅ Utility functions documented with docstrings + +## Error Handling + +Handle these scenarios gracefully: + +1. **Missing Dependencies**: If KBUtilLib or ModelSEEDpy not available, note in errors +2. **Existing Files**: Don't overwrite util.py if it already exists; validate instead +3. **Non-JSON Data**: Guide user to save to nboutput/ and load manually +4. **Complex Analysis**: Break into multiple cells for independence +5. 
**Long-Running Cells**: Emphasize saving intermediate results

## Privacy and Security Considerations

- Don't include API keys or credentials in util.py or notebooks
- Use environment variables or config files for sensitive data
- Document if manual credential setup is needed
- Don't log sensitive data in datacache/ files
- Note if data files contain sensitive information

## Example Workflow

For a typical metabolic modeling notebook:

1. **Cell 1**: Load genome data from genomes/
   - Markdown: Explain which genome and why
   - Code: Load, parse, save processed genome data

2. **Cell 2**: Load COBRA model from models/
   - Markdown: Explain model selection and purpose
   - Code: Load model, save to datacache

3. **Cell 3**: Load experimental data from data/
   - Markdown: Describe experimental conditions
   - Code: Load CSV, process, save data structure

4. **Cell 4**: Run flux balance analysis
   - Markdown: Explain FBA parameters and objectives
   - Code: Load model, run FBA, save results

5. **Cell 5**: Generate result tables
   - Markdown: Describe what tables show
   - Code: Load FBA results, create tables, save to nboutput/

Each cell independent, each with clear purpose, each properly cached.

diff --git a/.claude/commands/run_headless.md b/.claude/commands/run_headless.md new file mode 100644 index 00000000..b3a272fd --- /dev/null +++ b/.claude/commands/run_headless.md @@ -0,0 +1,158 @@

# Command: run_headless

## Purpose

Execute Claude Code commands in autonomous headless mode with comprehensive JSON output. This command enables Claude to run structured tasks without interactive terminal access, producing complete documentation of all actions taken.

## Overview

You are running in headless mode to execute structured commands. You will receive input that may include:
1. **Claude Commands**: One or more commands to be executed (e.g., create-prd, generate-tasks, doc-code-for-dev)
2. **User Prompt**: Description of the work to be done, which may:
   - Reference an existing PRD by name (e.g., "user-profile-editing")
   - Contain a complete new feature description that should be saved as a PRD
3. **PRD Reference Handling**: When a PRD name is referenced:
   - Look for `agent-io/prds/<prd-name>/humanprompt.md`
   - Look for `agent-io/prds/<prd-name>/fullprompt.md` if present
   - These files provide the detailed context for the work
4. **PRD Storage**: When a user prompt is provided without a PRD name:
   - Analyze the prompt to create a descriptive PRD name (use kebab-case)
   - Save the user prompt to `agent-io/prds/<prd-name>/humanprompt.md`
   - Document the PRD name in your output for future reference

Your job is to execute the command according to the instructions and produce a comprehensive JSON output file.

## Critical Principles for Headless Operation

### User Cannot See Terminal
- The user has NO access to your terminal output
- ALL relevant information MUST go in the JSON output file
- Do not assume the user saw anything you did
- Every action, decision, and result must be documented in `claude-output.json`

### Autonomous Execution
- Execute tasks independently without asking for permission
- Only ask questions when genuinely ambiguous or missing critical information
- Make reasonable assumptions and document them in comments
- Complete as much work as possible before requesting user input
- Work proactively to accomplish the full scope of the command
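To make the "everything goes in the JSON" principle concrete, even a trivial headless run should leave behind a complete record. A minimal, illustrative `claude-output.json` (the session ID, summary, and comment are placeholders) might be:

```json
{
  "command_type": "free-agent",
  "status": "complete",
  "session_id": "example-session-id",
  "parent_session_id": null,
  "session_summary": "Renamed 12 log files to use ISO-8601 date prefixes",
  "tasks": [],
  "files": {
    "created": [],
    "modified": [],
    "deleted": []
  },
  "comments": [
    "Assumed dates should come from each file's modification time"
  ],
  "queries_for_user": [],
  "errors": []
}
```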
## Command Execution Flow

Follow this process for all headless executions:

### 1. Parse Input and Handle PRDs
- Parse the input to identify:
  - Which Claude commands to execute
  - The user prompt describing the work
  - Whether a PRD name is referenced
- **If a PRD name is referenced**:
  - Read the PRD files from `agent-io/prds/<prd-name>/`
  - Use humanprompt.md and fullprompt.md (if available) as context
- **If user prompt provided without PRD name**:
  - Create a descriptive PRD name based on the prompt content (use kebab-case)
  - Create directory `agent-io/prds/<prd-name>/`
  - Save the user prompt to `agent-io/prds/<prd-name>/humanprompt.md`
  - Document the PRD name in your output
- If resuming from a previous session, review the parent session context

### 2. Execute Command
- Follow the instructions in the command file
- Apply the principles from the system prompt
- Work autonomously as much as possible
- Track all actions as you work

### 3. Track Everything
- Track all actions in memory as you work
- Build up the JSON output structure continuously
- Document files created, modified, or deleted
- Record task progress and status changes
- Capture all decisions and assumptions

### 4. Handle User Queries (if needed)
- If you need user input, prepare clear questions
- Format questions according to the JSON schema
- Save complete context for resumption
- Set status to "user_query"
- Ensure session_id is included for continuity

### 5. Write JSON Output
- Write the complete JSON to `claude-output.json`
- Ensure all required fields are present
- Validate JSON structure before writing
- Include comprehensive session_summary

## Example Headless Session

### Example 1: New PRD Creation

**Input:**
- Commands: `["create-prd"]`
- User prompt: "Add user profile editing feature with avatar upload and bio section"
- PRD name: Not provided

**Execution Process:**
1. Parse input - no PRD name provided, so create one
2. Generate PRD name: "user-profile-editing"
3. Create directory: `agent-io/prds/user-profile-editing/`
4. Save user prompt to `agent-io/prds/user-profile-editing/humanprompt.md`
5. Ask clarifying questions (if needed) by setting status to "user_query"
6. Generate enhanced PRD content
7. Save to `agent-io/prds/user-profile-editing/fullprompt.md`
8. Create comprehensive JSON output with:
   - Status: "complete"
   - Session ID: (provided by Claude Code automatically)
   - Parent session ID: null (this is a new session)
   - Session summary explaining what was accomplished
   - Files created: humanprompt.md, fullprompt.md, data.json
   - PRD name documented in artifacts
   - Any relevant comments, assumptions, or observations

### Example 2: Using Existing PRD

**Input:**
- Commands: `["generate-tasks"]`
- User prompt: "Generate implementation tasks for user-profile-editing"
- PRD name: "user-profile-editing" (referenced in prompt)

**Execution Process:**
1. Parse input - PRD name "user-profile-editing" identified
2. Read `agent-io/prds/user-profile-editing/humanprompt.md`
3. Read `agent-io/prds/user-profile-editing/fullprompt.md` (if exists)
4. Use PRD context to generate detailed task list
5. Save tasks to `agent-io/prds/user-profile-editing/data.json`
6.
Create comprehensive JSON output with task list and references + +### The user workflow: +- User reads `claude-output.json` to understand everything you did +- User can review created files based on paths in JSON +- User can resume work by creating new session with parent_session_id + +### If clarification is needed: +- Set status to "user_query" +- Include session_id in output +- Add queries_for_user array with clear, specific questions +- When user provides answers in a new session, that session will have parent_session_id pointing to this session +- Claude Code uses the session chain to maintain full context + +## Output Requirements + +Always output to: `claude-output.json` in the working directory + +The JSON must include: +- All required fields for the command type and status +- Complete file tracking (created, modified, deleted) +- Task progress if applicable +- Session information for continuity +- Comments explaining decisions and assumptions +- Any errors or warnings encountered + +## Best Practices for Headless Execution + +- **Be Specific**: Include file paths, line numbers, function names +- **Be Complete**: Don't leave out details assuming the user knows them +- **Be Clear**: Write for someone who wasn't watching you work +- **Be Actionable**: Comments should help the user understand next steps +- **Be Honest**: If something is incomplete or uncertain, say so +- **Be Thorough**: Document every action taken, no matter how small +- **Be Proactive**: Complete as much work as possible before asking questions diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 87de0099..ffde9f98 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -3,6 +3,8 @@ name: Run Pre-Commit on: pull_request: {} push: + paths-ignore: + - 'examples/**' branches: - dev - main @@ -13,7 +15,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v3 diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml new file mode 100644 index 00000000..c3d816d0 --- /dev/null +++ b/.github/workflows/tox.yml @@ -0,0 +1,28 @@ +name: Run Tox + +on: + pull_request: {} + push: + branches: [main] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.9', '3.10', '3.11'] + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel build + python -m pip install tox tox-gh-actions + - name: Test with tox + run: | + tox + python -m build . diff --git a/.gitignore b/.gitignore index 6390162b..591c53c2 100644 --- a/.gitignore +++ b/.gitignore @@ -5,10 +5,8 @@ __pycache__/ *.py[cod] *$py.class - # C extensions *.so - # Distribution / packaging .Python build/ @@ -29,17 +27,14 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST - # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec - # Installer logs pip-log.txt pip-delete-this-directory.txt - # Unit test / coverage reports htmlcov/ .tox/ @@ -53,81 +48,70 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ - # Translations *.mo *.pot - # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal - # Flask stuff: instance/ .webassets-cache - # Scrapy stuff: .scrapy - # Sphinx documentation docs/_build/ - # PyBuilder target/ - # Jupyter Notebook .ipynb_checkpoints .idea - # IPython profile_default/ ipython_config.py - # pyenv .python-version - # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock - # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ - # Celery stuff celerybeat-schedule celerybeat.pid - # SageMath parsed files *.sage.py - # Environments .env .venv +activate.sh env/ venv/ ENV/ env.bak/ venv.bak/ - # Spyder project settings .spyderproject .spyproject - # Rope project settings .ropeproject - # mkdocs documentation /site - # mypy .mypy_cache/ .dmypy.json dmypy.json - # Pyre type checker .pyre/ +.pydevproject +.settings/* +*data/* +*.lp + +# Cursor workspace files +*.code-workspace
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 04cde634..325706ab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,9 @@ repos: args: - --pytest-test-first - id: check-json + exclude: examples/ - id: pretty-format-json + exclude: examples/ args: - --autofix - --top-keys=_id
diff --git a/.travis.yml b/.travis.yml index e72cfaff..75b2eb81 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,8 @@ language: python python: - - 3.6 - - 3.7 - - 3.8 + - 3.9 + - 3.10 + - 3.11 before_install: - python --version - pip install -U pip
diff --git a/README.rst b/README.rst index 6f380d9a..3d491ec3 100644 --- a/README.rst +++ b/README.rst @@ -25,6 +25,10 @@ ________________________________________________________________________ :target: https://pepy.tech/project/modelseedpy :alt: Downloads + +.. image:: https://img.shields.io/badge/code%20style-black-000000.svg + :target: https://github.com/ambv/black + :alt: Black +
Metabolic modeling is a pivotal method for computational research in synthetic biology and precision medicine. Metabolic models, such as those used in constraint-based flux balance analysis (FBA), are improved with comprehensive datasets that capture more metabolic chemistry in the model and improve the accuracy of simulation predictions. We therefore developed ModelSEEDpy as a comprehensive suite of packages that bootstrap metabolic modeling with the ModelSEED Database (`Seaver et al., 2021 `_ ). These packages parse and manipulate (e.g. gapfill missing reactions or calculate chemical properties of metabolites), constrain (with kinetic, thermodynamic, and nutrient uptake constraints), and simulate cobrakbase models (both individual models and communities). This is achieved by standardizing COBRA models through the ``cobrakbase`` module into a form that is amenable to the KBase/ModelSEED ecosystem. These functionalities are exemplified in `Python Notebooks `_ . Please submit errors, inquiries, or suggestions as `GitHub issues `_ where they can be addressed by our developers.
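For orientation, a minimal ModelSEEDpy session might look like the following sketch (the genome file path and model ID are placeholders, and the builder options should be checked against the example notebooks)::

    from modelseedpy import MSBuilder, MSGenome

    # Load protein sequences for the organism being modeled
    genome = MSGenome.from_fasta("my_genome.faa")

    # Build a draft genome-scale model; the result is a standard COBRA model
    model = MSBuilder.build_metabolic_model("my_model", genome)
    print(model.optimize().objective_value)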
@@ -33,11 +37,11 @@ Metabolic modeling is an pivotal method for computational research in synthetic Installation ---------------------- -ModelSEEDpy will soon be installable via the ``PyPI`` channel:: +PIP (latest stable version 0.4.0):: pip install modelseedpy -but, until then, the repository must cloned:: +GitHub dev build (latest working version):: git clone https://github.com/ModelSEED/ModelSEEDpy.git @@ -51,8 +55,3 @@ The associated ModelSEED Database, which is required for a few packages, is simp git clone https://github.com/ModelSEED/ModelSEEDDatabase.git and the path to this repository is passed as an argument to the corresponding packages. - -**Windows users** must separately install the ``pyeda`` module: 1) download the appropriate wheel for your Python version from `this website `_ ; and 2) install the wheel through the following commands in a command prompt/powershell console:: - - cd path/to/pyeda/wheel - pip install pyeda_wheel_name.whl diff --git a/examples/Flux Analysis/FullThermodynamicsExample.ipynb b/examples/Flux Analysis/FullThermodynamicsExample.ipynb index b5ffac67..776c4933 100644 --- a/examples/Flux Analysis/FullThermodynamicsExample.ipynb +++ b/examples/Flux Analysis/FullThermodynamicsExample.ipynb @@ -1373,7 +1373,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.9" + "version": "3.9.12" } }, "nbformat": 4, diff --git a/examples/MSExpression/MSExpression_Usage.ipynb b/examples/MSExpression/MSExpression_Usage.ipynb new file mode 100644 index 00000000..4881979f --- /dev/null +++ b/examples/MSExpression/MSExpression_Usage.ipynb @@ -0,0 +1,497 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MSExpression Usage Guide\n", + "\n", + "This notebook demonstrates how to use the refactored MSExpression class for managing gene/protein expression data in ModelSEEDpy.\n", + "\n", + "## Overview\n", + "\n", + "MSExpression now uses pandas DataFrame internally for efficient numerical operations while maintaining custom data structures (MSCondition, MSExpressionFeature) for metadata tracking.\n", + "\n", + "**Key Features:**\n", + "- Fast vectorized operations using pandas\n", + "- Easy data loading from DataFrames, Excel, or CSV files\n", + "- Statistical analysis across conditions\n", + "- GPR-based reaction expression calculation\n", + "- Flexible data export" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from modelseedpy.multiomics.msexpression import MSExpression\n", + "from modelseedpy.core.msgenome import MSGenome, MSFeature\n", + "from cobra import Model, Reaction, Gene" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Basic Usage: Loading Expression Data\n", + "\n", + "### 1.1 Creating a Sample Dataset\n", + "\n", + "Let's create some sample expression data to work with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Create sample expression data\n", + "expression_data = pd.DataFrame({\n", + " 'gene_id': ['gene1', 'gene2', 'gene3', 'gene4', 'gene5'],\n", + " 'control_1': [10.5, 15.2, 8.7, 22.1, 5.3],\n", + " 'control_2': [11.2, 14.8, 9.1, 21.5, 5.8],\n", + " 'treatment_1': [45.3, 18.2, 7.1, 35.7, 2.1],\n", + " 'treatment_2': [43.8, 19.1, 6.9, 36.2, 2.5],\n", + " 'description': ['Growth factor', 'Transporter', 'Kinase', 'Transcription factor', 'Receptor']\n", + "})\n", + "\n", + "print(\"Sample Expression Data:\")\n", + "expression_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Creating a Genome with Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Create a genome and add features\n", + "genome = MSGenome()\n", + "for gene_id in expression_data['gene_id']:\n", + " genome.features.append(MSFeature(gene_id, ''))\n", + "\n", + "print(f\"Created genome with {len(genome.features)} features\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3 Loading Data from DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Load expression data from DataFrame\n", + "expr = MSExpression.from_dataframe(\n", + " expression_data,\n", + " genome=genome,\n", + " id_column='gene_id',\n", + " description_column='description',\n", + " type='RelativeAbundance'\n", + ")\n", + "\n", + "print(f\"Loaded expression data:\")\n", + "print(f\" - Features: {len(expr.features)}\")\n", + "print(f\" - Conditions: {len(expr.conditions)}\")\n", + "print(f\" - Data shape: {expr._data.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.4 Accessing Expression Values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Access individual values\n", + "value = expr.get_value('gene1', 'control_1')\n", + "print(f\"gene1 expression in control_1: {value}\")\n", + "\n", + "# Access using feature object\n", + "feature = expr.features.get_by_id('gene2')\n", + "value = feature.get_value('treatment_1')\n", + "print(f\"gene2 expression in treatment_1: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Statistical Analysis\n", + "\n", + "MSExpression provides efficient statistical methods for analyzing expression across conditions." 
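For orientation, the sketch below shows roughly what these condition-level statistics correspond to in plain pandas, using the sample table from section 1.1. The internal column layout of `expr._data`, and the reading of `value_at_zscore(z)` as mean plus z times the standard deviation, are assumptions made for illustration, not guaranteed API behavior:

```python
import pandas as pd

# Same numbers as the sample dataset above: genes as rows, conditions as columns.
df = pd.DataFrame({
    'control_1': [10.5, 15.2, 8.7, 22.1, 5.3],
    'treatment_1': [45.3, 18.2, 7.1, 35.7, 2.1],
}, index=['gene1', 'gene2', 'gene3', 'gene4', 'gene5'])

# One vectorized call per statistic covers every condition at once.
print(df.mean())  # per-condition means
print(df.min())   # per-condition minima
print(df.max())   # per-condition maxima
print(df.sum())   # per-condition sums

# Assumed z-score threshold: mean + z * standard deviation of one condition.
z = 1.0
print(df['control_1'].mean() + z * df['control_1'].std())
```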
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Basic Statistics per Condition" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Get statistics for each condition\n", + "for condition in expr.conditions:\n", + " print(f\"\\nStatistics for {condition.id}:\")\n", + " print(f\" Mean: {condition.average_value():.2f}\")\n", + " print(f\" Min: {condition.lowest_value():.2f}\")\n", + " print(f\" Max: {condition.highest_value():.2f}\")\n", + " print(f\" Sum: {condition.sum_value():.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Z-score Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Calculate z-score thresholds\n", + "condition = expr.conditions.get_by_id('control_1')\n", + "\n", + "z1 = condition.value_at_zscore(1.0)\n", + "z2 = condition.value_at_zscore(2.0)\n", + "\n", + "print(f\"Z-score thresholds for {condition.id}:\")\n", + "print(f\" 1σ: {z1:.2f}\")\n", + "print(f\" 2σ: {z2:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Comparing Conditions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Compare treatment vs control for each gene\n", + "print(\"\\nFold changes (treatment_1 / control_1):\")\n", + "for feature in expr.features:\n", + " control_val = feature.get_value('control_1')\n", + " treatment_val = feature.get_value('treatment_1')\n", + " \n", + " if control_val and control_val > 0:\n", + " fold_change = treatment_val / control_val\n", + " print(f\" {feature.id}: {fold_change:.2f}x\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Advanced Operations\n", + "\n", + "### 3.1 Working with Gene-Protein-Reaction (GPR) Rules\n", + "\n", + "MSExpression can calculate reaction-level expression from gene-level data using GPR rules." 
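Before the worked example, here is a small standalone sketch of the scoring rule that section 3.2 below describes (AND takes the minimum, OR takes the sum). It parses the rule with Python's `ast` module purely for illustration; it is not MSExpression's internal implementation:

```python
import ast

def evaluate_gpr(rule: str, values: dict) -> float:
    """Score a boolean GPR string against per-gene expression values."""
    def walk(node):
        if isinstance(node, ast.BoolOp):
            scores = [walk(v) for v in node.values]
            if isinstance(node.op, ast.And):
                return min(scores)  # enzyme complex: weakest subunit limits flux
            return sum(scores)      # isoenzymes: activities add up
        if isinstance(node, ast.Name):
            return values.get(node.id, 0.0)  # missing genes default to 0
        raise ValueError("unsupported GPR element")
    return walk(ast.parse(rule, mode="eval").body)

expression = {'gene1': 10.5, 'gene2': 15.2, 'gene3': 8.7}
print(evaluate_gpr('gene1 and gene2', expression))  # 10.5 (min)
print(evaluate_gpr('gene1 or gene3', expression))   # 19.2 (sum)
```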
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Create a simple metabolic model\n", + "model = Model('example_model')\n", + "\n", + "# Add genes\n", + "for gene_id in ['gene1', 'gene2', 'gene3']:\n", + " model.genes.append(Gene(gene_id))\n", + "\n", + "# Add reactions with GPR rules\n", + "rxn1 = Reaction('rxn1')\n", + "rxn1.gene_reaction_rule = 'gene1' # Simple single gene\n", + "\n", + "rxn2 = Reaction('rxn2')\n", + "rxn2.gene_reaction_rule = 'gene1 and gene2' # AND rule (min)\n", + "\n", + "rxn3 = Reaction('rxn3')\n", + "rxn3.gene_reaction_rule = 'gene1 or gene3' # OR rule (sum)\n", + "\n", + "model.add_reactions([rxn1, rxn2, rxn3])\n", + "\n", + "print(f\"Created model with {len(model.reactions)} reactions\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Build reaction expression from gene expression\n", + "rxn_expr = expr.build_reaction_expression(model, default=0.0)\n", + "\n", + "print(\"\\nReaction expression values:\")\n", + "for rxn_feature in rxn_expr.features:\n", + " rxn_id = rxn_feature.id\n", + " gpr = model.reactions.get_by_id(rxn_id).gene_reaction_rule\n", + " control_val = rxn_feature.get_value('control_1')\n", + " treatment_val = rxn_feature.get_value('treatment_1')\n", + " \n", + " print(f\"\\n{rxn_id} (GPR: {gpr}):\")\n", + " print(f\" Control: {control_val:.2f}\")\n", + " print(f\" Treatment: {treatment_val:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 Understanding GPR Logic\n", + "\n", + "- **OR**: Sum of gene values (isoenzymes)\n", + "- **AND**: Minimum gene value (enzyme complex)\n", + "- **Single gene**: Direct gene value" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Data Export\n", + "\n", + "### 4.1 Export to DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Get DataFrame with feature_id as index (default)\n", + "df_index = expr.get_dataframe()\n", + "print(\"DataFrame with index:\")\n", + "print(df_index.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Get DataFrame with feature_id as column\n", + "df_reset = expr.get_dataframe(reset_index=True)\n", + "print(\"\\nDataFrame with reset index:\")\n", + "print(df_reset.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2 Export to CSV/Excel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Export to CSV\n", + "df = expr.get_dataframe(reset_index=True)\n", + "# df.to_csv('expression_data.csv', index=False)\n", + "\n", + "# Export to Excel\n", + "# df.to_excel('expression_data.xlsx', index=False)\n", + "\n", + "print(\"Data can be exported using standard pandas methods\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Handling Missing Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Create data with missing values\n", + "data_with_nan = pd.DataFrame({\n", + " 'gene_id': ['gene1', 'gene2', 'gene3'],\n", + " 'cond1': [10.0, np.nan, 15.0],\n", + " 'cond2': [20.0, 25.0, np.nan]\n", + "})\n", + "\n", + "genome_test = MSGenome()\n", + "for g in ['gene1', 'gene2', 'gene3']:\n", + " genome_test.features.append(MSFeature(g, ''))\n", + "\n", + "expr_nan = MSExpression.from_dataframe(\n", + " data_with_nan, genome=genome_test, id_column='gene_id'\n", + ")\n", + "\n", + "# Missing values are returned as None\n", + "print(\"Handling missing values:\")\n", + "print(f\"gene2 in cond1: {expr_nan.get_value('gene2', 'cond1')}\") # None\n", + "print(f\"gene3 in cond2: {expr_nan.get_value('gene3', 'cond2')}\") # None\n", + "print(f\"gene1 in cond1: {expr_nan.get_value('gene1', 'cond1')}\") # 10.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Migration Guide\n", + "\n", + "If you're updating code from the old MSExpression implementation:\n", + "\n", + "### What Changed:\n", + "\n", + "1. **`get_dataframe()` format**: Now returns feature_id as index by default (use `reset_index=True` for old behavior)\n", + "2. **`feature.values` attribute**: No longer exists - values are stored in parent DataFrame\n", + "\n", + "### Migration Examples:\n", + "\n", + "```python\n", + "# OLD: Access feature.values directly\n", + "# value = feature.values[condition]\n", + "\n", + "# NEW: Use get_value() method\n", + "value = feature.get_value(condition)\n", + "\n", + "# OLD: get_dataframe() returned feature_id column\n", + "# df = expr.get_dataframe()\n", + "# df['feature_id']\n", + "\n", + "# NEW: get_dataframe() has feature_id as index by default\n", + "df = expr.get_dataframe() # feature_id is index\n", + "# Or use reset_index=True for old behavior\n", + "df = expr.get_dataframe(reset_index=True)\n", + "df['feature_id']\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Performance Tips\n", + "\n", + "The DataFrame-based implementation provides significant performance improvements:\n", + "\n", + "### Best Practices:\n", + "\n", + "1. **Bulk loading**: Use `from_dataframe()` instead of iterative `add_value()` calls\n", + "2. **Statistical operations**: Use MSCondition methods (they use vectorized pandas operations)\n", + "3. 
**Large datasets**: The DataFrame backend scales efficiently to thousands of features/conditions\n", + "\n", + "### Performance Example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "import time\n", + "\n", + "# Create large dataset\n", + "n_genes = 5000\n", + "n_conditions = 20\n", + "\n", + "large_data = pd.DataFrame(\n", + " np.random.randn(n_genes, n_conditions + 1),\n", + " columns=['gene_id'] + [f'cond_{i}' for i in range(n_conditions)]\n", + ")\n", + "large_data['gene_id'] = [f'gene_{i}' for i in range(n_genes)]\n", + "\n", + "# Time bulk loading\n", + "large_genome = MSGenome()\n", + "for i in range(n_genes):\n", + " large_genome.features.append(MSFeature(f'gene_{i}', ''))\n", + "\n", + "start = time.time()\n", + "large_expr = MSExpression.from_dataframe(\n", + " large_data, genome=large_genome, id_column='gene_id'\n", + ")\n", + "load_time = time.time() - start\n", + "\n", + "print(f\"\\nPerformance Test:\")\n", + "print(f\"Loaded {n_genes} genes × {n_conditions} conditions in {load_time:.3f} seconds\")\n", + "print(f\"Data shape: {large_expr._data.shape}\")\n", + "\n", + "# Time statistical operations\n", + "start = time.time()\n", + "for condition in large_expr.conditions:\n", + " avg = condition.average_value()\n", + "stats_time = time.time() - start\n", + "\n", + "print(f\"Computed statistics for {n_conditions} conditions in {stats_time:.3f} seconds\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "The refactored MSExpression class provides:\n", + "- ✅ Efficient DataFrame-based storage\n", + "- ✅ Fast vectorized statistical operations\n", + "- ✅ Easy data import/export\n", + "- ✅ Full backward compatibility (with minor changes)\n", + "- ✅ Type hints for better IDE support\n", + "- ✅ Comprehensive test coverage\n", + "\n", + "For more information, see the ModelSEEDpy documentation." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/Model Reconstruction/ATPGapfilling.ipynb b/examples/Model Reconstruction/ATPGapfilling.ipynb index f0116989..d236d609 100644 --- a/examples/Model Reconstruction/ATPGapfilling.ipynb +++ b/examples/Model Reconstruction/ATPGapfilling.ipynb @@ -526,7 +526,13 @@ "cell_type": "code", "execution_count": 60, "id": "6ade9096-f3f4-40f8-a1ea-53b5b63ec2c0", - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, "outputs": [ { "name": "stderr", @@ -1174,123 +1180,417 @@ }, { "cell_type": "code", - "execution_count": 67, - "id": "7aba6de8-9252-4980-95b0-bd1a72db2e05", + "execution_count": 1, + "id": "e24d8e82-357a-4658-9362-6073f502b6bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "modelseedpy 0.2.2\n" + ] + } + ], + "source": [ + "import modelseedpy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1080bc7b-58c2-4105-91a2-2defaa8a1c92", "metadata": {}, "outputs": [], "source": [ - "atp_correction.apply_growth_media_gapfilling()" + "%run /home/fliu/workspace/python3/ModelSEEDpy/tests/core/test_msatpcorreption.py" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3ee9a1dd-9b8c-4204-b846-609cecebffc7", + "metadata": {}, + "outputs": [], + "source": [ + "def get_model(ko):\n", + " def _method(ko=ko, added_compounds=None, added_reactions=None):\n", + " if ko is None:\n", + " ko = []\n", + " with open(\n", + " '/home/fliu/workspace/python3/ModelSEEDpy/tests/test_data/e_coli_core.json',\n", + " \"r\",\n", + " ) as fh:\n", + " model_json = json.load(fh)\n", + " model_json[\"compartments\"] = {\n", + " k + \"0\": v for (k, v) in model_json[\"compartments\"].items()\n", + " }\n", + " metabolites = {}\n", + " for m in model_json[\"metabolites\"]:\n", + " m[\"id\"] += \"0\"\n", + " m[\"compartment\"] += \"0\"\n", + " metabolites[m[\"id\"]] = m\n", + " for r in model_json[\"reactions\"]:\n", + " r[\"metabolites\"] = {i + \"0\": v for (i, v) in r[\"metabolites\"].items()}\n", + " compartments = set(\n", + " [metabolites[k][\"compartment\"] for k in r[\"metabolites\"].keys()]\n", + " )\n", + " if r[\"id\"].endswith(\"_e\"):\n", + " r[\"id\"] += \"0\"\n", + " elif len(compartments) == 1:\n", + " r[\"id\"] += \"_\" + list(compartments)[0]\n", + " else:\n", + " r[\"id\"] += (\n", + " \"_\" + \"c0\"\n", + " ) # hack cause there is only combo between e0 and c0\n", + "\n", + " model_json[\"reactions\"] = [\n", + " x for x in model_json[\"reactions\"] if x[\"id\"] not in ko\n", + " ]\n", + "\n", + " if added_compounds:\n", + " for o in added_compounds:\n", + " model_json[\"metabolites\"].append(o)\n", + " if added_reactions:\n", + " for o in added_reactions:\n", + " model_json[\"reactions\"].append(o)\n", + " model = cobra.io.from_json(json.dumps(model_json))\n", + " model.reactions.ATPM_c0.lower_bound = 0\n", + " model.reactions.ATPM_c0.upper_bound = 1000\n", + " return model\n", + "\n", + " return _method(ko)" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "e8107ba2-f470-4e05-8b80-731fc00febe7", + 
"execution_count": 45, + "id": "928bb140-9110-4a1a-b750-dbd9d6a2acc6", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "95db6e6f-bedc-4c0d-9e73-c6eec5365c16", + "metadata": {}, + "outputs": [], + "source": [ + "model = get_model([\"NADH16_c0\", \"CYTBD_c0\", \"O2t_c0\", \"GLCpts_c0\"])\n", + "with open('/home/fliu/workspace/python3/ModelSEEDpy/tests/test_data/template_core_bigg.json', 'r') as fh:\n", + " template = MSTemplateBuilder.from_dict(json.load(fh)).build()\n", + "media_glucose_aerobic = MSMedia.from_dict(\n", + " {\n", + " \"glc__D\": (-1, 1000),\n", + " \"o2\": (-1000, 1000),\n", + " \"h\": (-1000, 1000),\n", + " \"h2o\": (-1000, 1000),\n", + " }\n", + " )\n", + "media_glucose_aerobic.id = 'glc/o2'\n", + "media_acetate_aerobic = MSMedia.from_dict(\n", + " {\n", + " \"ac\": (-1, 1000),\n", + " \"o2\": (-1000, 1000),\n", + " \"h\": (-1000, 1000),\n", + " \"h2o\": (-1000, 1000),\n", + " }\n", + " )\n", + "media_acetate_aerobic.id = 'ac/o2'\n", + "medias = [media_glucose_aerobic, media_acetate_aerobic]" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "8fdc8faf-fcc8-45cd-b775-e6bc143a42cc", + "metadata": {}, + "outputs": [], + "source": [ + "%run /home/fliu/workspace/python3/ModelSEEDpy/modelseedpy/core/msatpcorrection.py\n", + "atp_correction = MSATPCorrection(\n", + " model,\n", + " template,\n", + " medias,\n", + " atp_hydrolysis_id=\"ATPM_c0\",\n", + " load_default_medias=False,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "fc07b43d-88f5-477c-9149-28756a5cd926", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0" + "[[, 0.01],\n", + " [, 0.01]]" ] }, - "execution_count": 18, + "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "atp_correction.max_gapfilling" + "atp_correction.atp_medias" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "1af1e574-76b2-40f7-82f8-4ffd1bb2c442", + "execution_count": 99, + "id": "369ef2d4-f696-4762-9370-d91276e3b95f", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Namee_coli_core
Memory address7ff258653370
Number of metabolites72
Number of reactions91
Number of genes137
Number of groups0
Objective expression1.0*BIOMASS_Ecoli_core_w_GAM_c0 - 1.0*BIOMASS_Ecoli_core_w_GAM_c0_reverse_70c47
Compartmentsextracellular space, cytosol
" + ], "text/plain": [ - "0" + "" ] }, - "execution_count": 19, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "atp_correction.gapfilling_delta" + "model" ] }, { "cell_type": "code", - "execution_count": 43, - "id": "0a344084-edad-456f-9e88-064a404039d4", + "execution_count": 100, + "id": "62862b90-d73b-4597-8e3f-c8bf55e9090e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "{'glc/o2': 0.0, 'ac/o2': 0.0}" ] }, - "execution_count": 43, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "atp_correction.gapfilling_tests" + "atp_correction.evaluate_growth_media()" ] }, { "cell_type": "code", - "execution_count": 44, - "id": "9e78779d-b7e7-4e73-a77c-9813bee3c6a9", + "execution_count": 101, + "id": "e67db875-e06f-464c-b96c-8e4ce7eb6324", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "{: {'reversed': {},\n", + " 'new': {'GLCpts_c0': '>'},\n", + " 'media': ,\n", + " 'target': 'ATPM_c0',\n", + " 'minobjective': 0.01,\n", + " 'binary_check': False},\n", + " : {'reversed': {},\n", + " 'new': {'CYTBD_c0': '>', 'NADH16_c0': '>', 'O2t_c0': '>'},\n", + " 'media': ,\n", + " 'target': 'ATPM_c0',\n", + " 'minobjective': 0.01,\n", + " 'binary_check': False}}" ] }, - "execution_count": 44, + "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "atp_correction.gapfilling_tests" + "atp_correction.media_gapfill_stats" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "47da598f-b3cd-423d-93eb-0e68f11eaef9", + "metadata": {}, + "outputs": [], + "source": [ + "atp_correction.determine_growth_media()" ] }, { "cell_type": "code", - "execution_count": 68, - "id": "669e1ddb-493b-461e-bef9-d19cb1f5e542", + "execution_count": 105, + "id": "42673388-2500-4922-83b9-3e4dfa7acb17", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "'glc/o2'" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "atp_correction.selected_media[0].id" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "c0e29cc8-85d5-450e-a3d6-c1207d297963", + "metadata": {}, + "outputs": [], + "source": [ + "atp_correction.apply_growth_media_gapfilling()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "43f29d4f-30b3-452f-a5f9-49489b97d646", + "metadata": {}, + "outputs": [], + "source": [ + "media_eval = atp_correction.evaluate_growth_media()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "f8044fd4-70f1-4082-9316-e601ac06ac7e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'glc/o2': 2.75, 'ac/o2': 0.0}" ] }, - "execution_count": 68, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "atp_correction.gapfilling_tests" + "media_eval" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "db1e8df2-4a86-408b-a479-5eebf13e9971", + "metadata": {}, + "outputs": [], + "source": [ + "atp_correction.expand_model_to_genome_scale()" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "d76dcb54-1ea2-4e53-8853-521790cd8300", + "metadata": {}, + "outputs": [], + "source": [ + "tests = atp_correction.build_tests()" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "f30e70fa-5258-42fd-b624-aafdce509b80", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "empty {'media': , 'is_max_threshold': 
True, 'threshold': 1e-05, 'objective': 'ATPM_c0'}\n", + "glc/o2 {'media': , 'is_max_threshold': True, 'threshold': 3.3, 'objective': 'ATPM_c0'}\n" + ] + } + ], + "source": [ + "for t in tests:\n", + " print(t['media'].id, t)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "c35d3047-da1f-4331-a907-765c2b43048d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'media': ,\n", + " 'is_max_threshold': True,\n", + " 'threshold': 1e-05,\n", + " 'objective': 'ATPM_c0'},\n", + " {'media': ,\n", + " 'is_max_threshold': True,\n", + " 'threshold': 3.3,\n", + " 'objective': 'ATPM_c0'}]" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tests" ] }, { "cell_type": "code", "execution_count": null, - "id": "e24d8e82-357a-4658-9362-6073f502b6bc", + "id": "7b718e1d-059d-410b-bf1a-05a734f09e0d", "metadata": {}, "outputs": [], "source": [] @@ -1298,7 +1598,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/examples/Model Reconstruction/Biomass.ipynb b/examples/Model Reconstruction/Biomass.ipynb index e4a2c901..3726f959 100644 --- a/examples/Model Reconstruction/Biomass.ipynb +++ b/examples/Model Reconstruction/Biomass.ipynb @@ -2,18 +2,17 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "5434992c-fc67-40f5-ae08-82f44790666c", "metadata": {}, "outputs": [], "source": [ - "from modelseedpy.helpers import get_template\n", - "from modelseedpy.core.mstemplate import MSTemplateBuilder" + "import modelseedpy" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 2, "id": "b243e00a-4a8b-489d-a778-61844a439e63", "metadata": {}, "outputs": [ @@ -21,7 +20,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "cobrakbase 0.2.8\n" + "cobrakbase 0.3.1\n" ] } ], @@ -30,6 +29,157 @@ "kbase = cobrakbase.KBaseAPI()" ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3a177c16-ecb0-4050-bbf5-47aad10f2af9", + "metadata": {}, + "outputs": [], + "source": [ + "template = kbase.get_from_ws('GramNegModelTemplateV3', 'NewKBaseModelTemplates')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4ce52552-dce2-4c44-9884-cf00d15e76ab", + "metadata": {}, + "outputs": [], + "source": [ + "from modelseedpy import MSBuilder" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6f216f6a-5e25-4697-bf6b-9ae63475b5c7", + "metadata": {}, + "outputs": [], + "source": [ + "from cobra.core import Model\n", + "model = Model('test')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d9763d58-daba-4751-811f-23581b390025", + "metadata": {}, + "outputs": [], + "source": [ + "biomass = template.biomasses[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d3e884ac-2568-445a-ac04-1508b536c88a", + "metadata": {}, + "outputs": [], + "source": [ + "reaction = biomass.build_biomass(model, '0', True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f5140ac5-273f-4eb5-b806-ddd9178b252e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cpd00010_c0 {'modelseed_template_id': 'cpd00010_c'}\n", + "cpd11493_c0 {'modelseed_template_id': 'cpd11493_c'}\n", + "cpd12370_c0 {'modelseed_template_id': 'cpd12370_c'}\n", + "cpd00003_c0 {'modelseed_template_id': 'cpd00003_c'}\n", + "cpd00006_c0 {'modelseed_template_id': 
'cpd00006_c'}\n", + "cpd00205_c0 {'modelseed_template_id': 'cpd00205_c'}\n", + "cpd00254_c0 {'modelseed_template_id': 'cpd00254_c'}\n", + "cpd10516_c0 {'modelseed_template_id': 'cpd10516_c'}\n", + "cpd00063_c0 {'modelseed_template_id': 'cpd00063_c'}\n", + "cpd00009_c0 {'modelseed_template_id': 'cpd00009_c'}\n", + "cpd00099_c0 {'modelseed_template_id': 'cpd00099_c'}\n", + "cpd00149_c0 {'modelseed_template_id': 'cpd00149_c'}\n", + "cpd00058_c0 {'modelseed_template_id': 'cpd00058_c'}\n", + "cpd00015_c0 {'modelseed_template_id': 'cpd00015_c'}\n", + "cpd10515_c0 {'modelseed_template_id': 'cpd10515_c'}\n", + "cpd00030_c0 {'modelseed_template_id': 'cpd00030_c'}\n", + "cpd00048_c0 {'modelseed_template_id': 'cpd00048_c'}\n", + "cpd00034_c0 {'modelseed_template_id': 'cpd00034_c'}\n", + "cpd00016_c0 {'modelseed_template_id': 'cpd00016_c'}\n", + "cpd00220_c0 {'modelseed_template_id': 'cpd00220_c'}\n", + "cpd00017_c0 {'modelseed_template_id': 'cpd00017_c'}\n", + "cpd00201_c0 {'modelseed_template_id': 'cpd00201_c'}\n", + "cpd00087_c0 {'modelseed_template_id': 'cpd00087_c'}\n", + "cpd00345_c0 {'modelseed_template_id': 'cpd00345_c'}\n", + "cpd00042_c0 {'modelseed_template_id': 'cpd00042_c'}\n", + "cpd00028_c0 {'modelseed_template_id': 'cpd00028_c'}\n", + "cpd00557_c0 {'modelseed_template_id': 'cpd00557_c'}\n", + "cpd00264_c0 {'modelseed_template_id': 'cpd00264_c'}\n", + "cpd00118_c0 {'modelseed_template_id': 'cpd00118_c'}\n", + "cpd00056_c0 {'modelseed_template_id': 'cpd00056_c'}\n", + "cpd15560_c0 {'modelseed_template_id': 'cpd15560_c'}\n", + "cpd15352_c0 {'modelseed_template_id': 'cpd15352_c'}\n", + "cpd15500_c0 {'modelseed_template_id': 'cpd15500_c'}\n", + "cpd00166_c0 {'modelseed_template_id': 'cpd00166_c'}\n", + "cpd01997_c0 {'modelseed_template_id': 'cpd01997_c'}\n", + "cpd03422_c0 {'modelseed_template_id': 'cpd03422_c'}\n", + "cpd00104_c0 {'modelseed_template_id': 'cpd00104_c'}\n", + "cpd00037_c0 {'modelseed_template_id': 'cpd00037_c'}\n", + "cpd00050_c0 {'modelseed_template_id': 'cpd00050_c'}\n", + "cpd15793_c0 {'modelseed_template_id': 'cpd15793_c'}\n", + "cpd15540_c0 {'modelseed_template_id': 'cpd15540_c'}\n", + "cpd15533_c0 {'modelseed_template_id': 'cpd15533_c'}\n", + "cpd15432_c0 {'modelseed_template_id': 'cpd15432_c'}\n", + "cpd02229_c0 {'modelseed_template_id': 'cpd02229_c'}\n", + "cpd15665_c0 {'modelseed_template_id': 'cpd15665_c'}\n", + "cpd15666_c0 {'modelseed_template_id': 'cpd15666_c'}\n", + "cpd00023_c0 {'modelseed_template_id': 'cpd00023_c'}\n", + "cpd00001_c0 {'modelseed_template_id': 'cpd00001_c'}\n", + "cpd00033_c0 {'modelseed_template_id': 'cpd00033_c'}\n", + "cpd00035_c0 {'modelseed_template_id': 'cpd00035_c'}\n", + "cpd00039_c0 {'modelseed_template_id': 'cpd00039_c'}\n", + "cpd00041_c0 {'modelseed_template_id': 'cpd00041_c'}\n", + "cpd00051_c0 {'modelseed_template_id': 'cpd00051_c'}\n", + "cpd00053_c0 {'modelseed_template_id': 'cpd00053_c'}\n", + "cpd00054_c0 {'modelseed_template_id': 'cpd00054_c'}\n", + "cpd00060_c0 {'modelseed_template_id': 'cpd00060_c'}\n", + "cpd00065_c0 {'modelseed_template_id': 'cpd00065_c'}\n", + "cpd00066_c0 {'modelseed_template_id': 'cpd00066_c'}\n", + "cpd00069_c0 {'modelseed_template_id': 'cpd00069_c'}\n", + "cpd00084_c0 {'modelseed_template_id': 'cpd00084_c'}\n", + "cpd00107_c0 {'modelseed_template_id': 'cpd00107_c'}\n", + "cpd00119_c0 {'modelseed_template_id': 'cpd00119_c'}\n", + "cpd00129_c0 {'modelseed_template_id': 'cpd00129_c'}\n", + "cpd00132_c0 {'modelseed_template_id': 'cpd00132_c'}\n", + "cpd00156_c0 {'modelseed_template_id': 
'cpd00156_c'}\n", + "cpd00161_c0 {'modelseed_template_id': 'cpd00161_c'}\n", + "cpd00322_c0 {'modelseed_template_id': 'cpd00322_c'}\n", + "cpd00115_c0 {'modelseed_template_id': 'cpd00115_c'}\n", + "cpd00012_c0 {'modelseed_template_id': 'cpd00012_c'}\n", + "cpd00241_c0 {'modelseed_template_id': 'cpd00241_c'}\n", + "cpd00356_c0 {'modelseed_template_id': 'cpd00356_c'}\n", + "cpd00357_c0 {'modelseed_template_id': 'cpd00357_c'}\n", + "cpd00002_c0 {'modelseed_template_id': 'cpd00002_c'}\n", + "cpd00038_c0 {'modelseed_template_id': 'cpd00038_c'}\n", + "cpd00052_c0 {'modelseed_template_id': 'cpd00052_c'}\n", + "cpd00062_c0 {'modelseed_template_id': 'cpd00062_c'}\n", + "cpd00008_c0 {'modelseed_template_id': 'cpd00008_c'}\n", + "cpd00067_c0 {'modelseed_template_id': 'cpd00067_c'}\n", + "cpd11416_c0 {'modelseed_template_id': 'cpd11416_c'}\n", + "cpd17041_c0 {'modelseed_template_id': 'cpd17041_c'}\n", + "cpd17042_c0 {'modelseed_template_id': 'cpd17042_c'}\n", + "cpd17043_c0 {'modelseed_template_id': 'cpd17043_c'}\n" + ] + } + ], + "source": [ + "for m in reaction.metabolites:\n", + " print(m, m.notes)" + ] + }, { "cell_type": "code", "execution_count": 42, @@ -551,7 +701,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/examples/Model Reconstruction/Gapfilling.ipynb b/examples/Model Reconstruction/Gapfilling.ipynb index eea0c536..88eadaa6 100644 --- a/examples/Model Reconstruction/Gapfilling.ipynb +++ b/examples/Model Reconstruction/Gapfilling.ipynb @@ -2,17 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cobrakbase 0.2.8\n" - ] - } - ], + "outputs": [], "source": [ "import cobra\n", "#If you have CPLEX, uncomment this\n", @@ -20,31 +12,37 @@ "import cobrakbase\n", "#import modelseedpy.fbapkg\n", "from modelseedpy import GapfillingPkg, KBaseMediaPkg\n", - "from modelseedpy import FBAHelper, MSBuilder" + "from modelseedpy import FBAHelper, MSBuilder\n", + "kbase_api = cobrakbase.KBaseAPI()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "model = kbase_api.get_from_ws(\"test_model\",18528)" + ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:modelseedpy.core.msmodelutl:cpd00244 not found in model!\n" + ] + }, { "data": { "text/html": [ - "

Objective

1.0 bio1 = 0.8048653841131165

Uptake

\n", + "

Objective

1.0 bio1 = 0.7997546667881398

Uptake

\n", " \n", " \n", " \n", @@ -58,14 +56,14 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -79,98 +77,98 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -189,28 +187,35 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -218,19 +223,15 @@ "
Metabolite
cpd00009_e0EX_cpd00009_e00.99980.993400.00%
cpd00013_e0EX_cpd00013_e06.0376.09400.00%
cpd00030_e0EX_cpd00030_e00.006390.0063500.00%
cpd00034_e0EX_cpd00034_e00.006390.0063500.00%
cpd00048_e0EX_cpd00048_e00.17550.174400.00%
cpd00058_e0EX_cpd00058_e00.006390.0063500.00%
cpd00063_e0EX_cpd00063_e00.006390.0063500.00%
cpd00067_e0EX_cpd00067_e061.8561.4300.00%
cpd00099_e0EX_cpd00099_e00.006390.0063500.00%
cpd00149_e0EX_cpd00149_e00.006390.0063500.00%
cpd00205_e0EX_cpd00205_e00.006390.0063500.00%
cpd00254_e0EX_cpd00254_e00.006390.0063500.00%
cpd10516_e0EX_cpd10516_e00.025560.025400.00%
cpd17041_c0rxn13782_c00.80490.799800.00%
cpd17042_c0rxn13783_c00.80490.799800.00%
cpd17043_c0rxn13784_c00.80490.799800.00%
cpd00001_e0EX_cpd00001_e0-82.26-81.9500.00%
cpd00007_e0EX_cpd00007_e0-2.928-2.86900.00%
cpd15378_e0EX_cpd15378_e0-0.00639-0.006357100.00%18.92%
cpd03091_c0SK_cpd03091_c0-0.019051081.08%
cpd11416_c0SK_cpd11416_c0-0.8049-0.799800.00%
" ], "text/plain": [ - "" + "" ] }, - "execution_count": 2, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "kbase_api = cobrakbase.KBaseAPI()\n", - "model = kbase_api.get_from_ws(\"test_model\",18528)\n", - "#If you have CPLEX, uncomment this\n", - "#model.solver = 'optlang-cplex'\n", "template = kbase_api.get_from_ws(\"GramNegModelTemplateV3\",\"NewKBaseModelTemplates\")\n", "media = kbase_api.get_from_ws(\"Carbon-D-Glucose\",\"KBaseMedia\")\n", "model = MSBuilder.gapfill_model(model,\"bio1\",template,media)\n", @@ -17910,7 +17911,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/examples/Model Reconstruction/Genomes.ipynb b/examples/Model Reconstruction/Genomes.ipynb index 60270468..8ea82ef4 100644 --- a/examples/Model Reconstruction/Genomes.ipynb +++ b/examples/Model Reconstruction/Genomes.ipynb @@ -1,223 +1,300 @@ { "cells": [ { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "metadata": { + "tags": [] + }, "source": [ - "import modelseedpy\n", - "from modelseedpy.core.msgenome import MSGenome\n", - "from modelseedpy.core.rast_client import RastClient" + "### Genomes\n", + "\n", + "ModelSEEDpy provides its own genome object type `modelseedpy.core.msgenome.MSGenome` to manipulate genomes" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "genome = MS" + "import modelseedpy\n", + "from modelseedpy.core.msgenome import MSGenome" ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, "source": [ - "1" + "#### Reading faa file\n", + "\n", + "To load a genome we can read a `.faa` file that contains protein sequences" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "genome = MSGenome.from_fasta('GCF_000005845.2_ASM584v2_protein.faa', split=' ')" + ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "rast = RastClient()" + "genome" ] }, { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, "source": [ - "genome = MSGenome.from_fasta('GCF_000005845.2.faa', split=' ')" + "#### Manipulating genes\n", + "\n", + "Each gene is stored as a `modelseedpy.core.msgenome.MSFeature` in the `.features` of type `cobra.core.dictlist.DictList` similiar to the cobrapy `.reactions` and `.metabolites` in the `cobra.core.Model`" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of features: 3\n" - ] + "data": { + "text/plain": [ + "4285" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "print('Number of features:', len(genome.features))" + "len(genome.features)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], 
+ "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "for f in genome.features:\n", - " print(f.id, len(f.seq), f.description)" + "gene = genome.features.get_by_id('NP_414542.1')\n", + "gene" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[{'execution_time': 1622756127.36331,\n", - " 'tool_name': 'kmer_search',\n", - " 'hostname': 'pear',\n", - " 'parameters': ['-a',\n", - " '-g',\n", - " 200,\n", - " '-m',\n", - " 5,\n", - " '-d',\n", - " '/opt/patric-common/data/kmer_metadata_v2',\n", - " '-u',\n", - " 'http://pear.mcs.anl.gov:6100/query'],\n", - " 'id': '9CCA6D20-C4B3-11EB-A893-36A8BEF382BD'},\n", - " {'parameters': ['annotate_hypothetical_only=1',\n", - " 'dataset_name=Release70',\n", - " 'kmer_size=8'],\n", - " 'hostname': 'pear',\n", - " 'tool_name': 'KmerAnnotationByFigfam',\n", - " 'id': '9CE3769E-C4B3-11EB-A893-36A8BEF382BD',\n", - " 'execution_time': 1622756127.52738},\n", - " {'execute_time': 1622756127.88296,\n", - " 'hostname': 'pear',\n", - " 'parameters': [],\n", - " 'tool_name': 'annotate_proteins_similarity',\n", - " 'id': '9D19B7EA-C4B3-11EB-9714-71B3BDF382BD'}]" + "modelseedpy.core.msgenome.MSFeature" ] }, - "execution_count": 14, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "rast.annotate_genome(genome)" + "type(gene)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Equivalent call from the client it self" + "##### Gene annotation\n", + "Annotation is store as an **ontology term**. When loading from a `.faa` file no ontology term is present but we can add them later." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#genome, res = rast.annotate_genome_from_fasta('GCF_000005845.2_ASM584v2_protein.faa', split=' ')\n", - "#res" + "gene.ontology_terms" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "'thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655]'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene.description" + ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 9, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "{'annotation': ['thr operon leader peptide [Escherichia coli str. K-12 substr. 
MG1655]']}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene.add_ontology_term('annotation', gene.description)\n", + "gene.ontology_terms" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "#### RAST\n", + "It is possible to annotate genomes with RAST by calling the `RastClient`" + ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from modelseedpy.core.rast_client import RastClient\n", + "rast = RastClient()" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': 'C54F08A4-CDB3-11ED-A7E9-CAF09D6086F0',\n", + " 'parameters': ['-a',\n", + " '-g',\n", + " 200,\n", + " '-m',\n", + " 5,\n", + " '-d',\n", + " '/opt/patric-common/data/kmer_metadata_v2',\n", + " '-u',\n", + " 'http://pear.mcs.anl.gov:6100/query'],\n", + " 'hostname': 'pear',\n", + " 'tool_name': 'kmer_search',\n", + " 'execution_time': 1680040751.14837},\n", + " {'id': 'C5638324-CDB3-11ED-A7E9-CAF09D6086F0',\n", + " 'parameters': ['annotate_hypothetical_only=1',\n", + " 'dataset_name=Release70',\n", + " 'kmer_size=8'],\n", + " 'tool_name': 'KmerAnnotationByFigfam',\n", + " 'hostname': 'pear',\n", + " 'execution_time': 1680040751.28257},\n", + " {'parameters': [],\n", + " 'id': 'C5944E1E-CDB3-11ED-8217-51F29F6086F0',\n", + " 'execute_time': 1680040751.60236,\n", + " 'tool_name': 'annotate_proteins_similarity',\n", + " 'hostname': 'pear'}]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rast.annotate_genome(genome)" + ] }, { - "cell_type": "code", - "execution_count": 34, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "feature = genome.features.get_by_id('YP_588478.1')" + "RAST annotation is stored in the ontology term **RAST** and this is used as default to build metabolic models with the ModelSEED templates" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'RAST': 'DUF1435 domain-containing protein YjjZ [Escherichia coli str. K-12 substr. MG1655]'}" + "{'annotation': ['thr operon leader peptide [Escherichia coli str. K-12 substr. 
MG1655]'],\n", + " 'RAST': ['Thr operon leader peptide']}" ] }, - "execution_count": 36, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "feature.ontology_terms" + "gene.ontology_terms" ] }, { @@ -225,14 +302,12 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "feature.add_ontology_term('')" - ] + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -246,7 +321,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/examples/Model Reconstruction/build_metabolic_model.ipynb b/examples/Model Reconstruction/build_metabolic_model.ipynb index 2f1e8d3f..ea2e8d41 100644 --- a/examples/Model Reconstruction/build_metabolic_model.ipynb +++ b/examples/Model Reconstruction/build_metabolic_model.ipynb @@ -1,12 +1,26 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build Metabolic Model from Genome .faa file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* MSGenome: to read a faa file\n", + "* MSBuilder: to build metabolic model from the genome" + ] + }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import modelseedpy\n", "from modelseedpy import MSBuilder, MSGenome" ] }, @@ -19,21 +33,1446 @@ "genome = MSGenome.from_fasta('GCF_000005845.2_ASM584v2_protein.faa', split=' ')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`MSBuilder.build_metabolic_model` default parameters runs RAST, ML prediction to select template (gram neg, gram pos, cyano [not implemented], archaea [not implemented]), builds draft model and gapfills with complete media" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "type object argument after ** must be a mapping, not str", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_3118582/859642788.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mmodelseedpy\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mRastClient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mrast\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mRastClient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mrast\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mannotate_genome\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgenome\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/core/rast_client.py\u001b[0m in \u001b[0;36mannotate_genome\u001b[0;34m(self, genome)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m 
\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mp_features\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"id\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"protein_translation\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"features\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/core/rast_client.py\u001b[0m in \u001b[0;36mf\u001b[0;34m(self, p_features)\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp_features\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0mparams\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"features\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mp_features\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\"stages\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstages\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 93\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrpc_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"GenomeAnnotation.run_pipeline\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 94\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/core/rpcclient.py\u001b[0m in \u001b[0;36mcall\u001b[0;34m(self, method, params, token)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0merr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mret\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"error\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 75\u001b[0;31m \u001b[0;32mraise\u001b[0m 
\u001b[0mServerError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"error\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 76\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mServerError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Unknown\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mret\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: type object argument after ** must be a mapping, not str" + ] + } + ], + "source": [ + "from modelseedpy import RastClient\n", + "rast = RastClient()\n", + "rast.annotate_genome(genome)" + ] + }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Number of features: 4285\n" + "/home/fliu/.local/lib/python3.8/site-packages/cobra/io/dict.py:89: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " if isinstance(value, np.float):\n", + "/home/fliu/.local/lib/python3.8/site-packages/cobra/io/dict.py:91: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " if isinstance(value, np.bool):\n" ] } ], "source": [ - "print('Number of features:', len(genome.features))" + "model = MSBuilder.build_metabolic_model('ecoli', genome, classic_biomass=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Objective

1.0 bio1 = 141.02637369025626

Uptake

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetaboliteReactionFluxC-NumberC-Flux
cpd00007_e0EX_cpd00007_e0244.300.00%
cpd00024_e0EX_cpd00024_e083.0752.58%
cpd00028_e0EX_cpd00028_e00.3955340.08%
cpd00030_e0EX_cpd00030_e00.395500.00%
cpd00033_e0EX_cpd00033_e079.8120.99%
cpd00034_e0EX_cpd00034_e00.395500.00%
cpd00039_e0EX_cpd00039_e031.4261.17%
cpd00051_e0EX_cpd00051_e034.7461.29%
cpd00054_e0EX_cpd00054_e034.3530.64%
cpd00058_e0EX_cpd00058_e00.395500.00%
cpd00060_e0EX_cpd00060_e031.0950.96%
cpd00063_e0EX_cpd00063_e00.395500.00%
cpd00065_e0EX_cpd00065_e06.647110.45%
cpd00066_e0EX_cpd00066_e021.7691.21%
cpd00069_e0EX_cpd00069_e016.9990.95%
cpd00079_e0EX_cpd00079_e0499.9618.61%
cpd00080_e0EX_cpd00080_e0609.4311.34%
cpd00099_e0EX_cpd00099_e00.395500.00%
cpd00106_e0EX_cpd00106_e0401.249.96%
cpd00107_e0EX_cpd00107_e052.8661.97%
cpd00118_e0EX_cpd00118_e00.395540.01%
cpd00119_e0EX_cpd00119_e011.1660.42%
cpd00129_e0EX_cpd00129_e025.9650.81%
cpd00130_e0EX_cpd00130_e0199.144.94%
cpd00132_e0EX_cpd00132_e028.2840.70%
cpd00136_e0EX_cpd00136_e00.395570.02%
cpd00149_e0EX_cpd00149_e00.395500.00%
cpd00156_e0EX_cpd00156_e049.651.54%
cpd00161_e0EX_cpd00161_e029.7240.74%
cpd00184_e0EX_cpd00184_e0221.11013.71%
cpd00205_e0EX_cpd00205_e00.395500.00%
cpd00208_e0EX_cpd00208_e03.526120.26%
cpd00209_e0EX_cpd00209_e019000.00%
cpd00249_e0EX_cpd00249_e011.5690.65%
cpd00254_e0EX_cpd00254_e00.395500.00%
cpd00264_e0EX_cpd00264_e00.395570.02%
cpd00268_e0EX_cpd00268_e00.197800.00%
cpd00277_e0EX_cpd00277_e022.59101.40%
cpd00305_e0EX_cpd00305_e00.3955120.03%
cpd00322_e0EX_cpd00322_e034.0561.27%
cpd00355_e0EX_cpd00355_e00.791110.05%
cpd00367_e0EX_cpd00367_e012.9990.73%
cpd00383_e0EX_cpd00383_e01.97870.09%
cpd00412_e0EX_cpd00412_e02.76990.15%
cpd00438_e0EX_cpd00438_e02411014.95%
cpd00644_e0EX_cpd00644_e00.79190.04%
cpd00794_e0EX_cpd00794_e014.1121.05%
cpd01080_e0EX_cpd01080_e035.09183.92%
cpd03847_e0EX_cpd03847_e03.526140.31%
cpd10515_e0EX_cpd10515_e00.79100.00%
cpd10516_e0EX_cpd10516_e00.395500.00%
cpd17041_c0rxn13782_c014100.00%
cpd17042_c0rxn13783_c014100.00%
cpd17043_c0rxn13784_c014100.00%

Secretion

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetaboliteReactionFluxC-NumberC-Flux
cpd00009_e0EX_cpd00009_e0-100000.00%
cpd00011_e0EX_cpd00011_e0-796.817.50%
cpd00020_e0EX_cpd00020_e0-282.137.97%
cpd00027_e0EX_cpd00027_e0-445.8625.18%
cpd00029_e0EX_cpd00029_e0-49029.22%
cpd00035_e0EX_cpd00035_e0-185.235.23%
cpd00047_e0EX_cpd00047_e0-2.37310.02%
cpd00100_e0EX_cpd00100_e0-4.38630.12%
cpd00108_e0EX_cpd00108_e0-3.52660.20%
cpd00116_e0EX_cpd00116_e0-0.395510.00%
cpd00139_e0EX_cpd00139_e0-1.18720.02%
cpd00151_e0EX_cpd00151_e0-221.1510.40%
cpd00159_e0EX_cpd00159_e0-835.5323.60%
cpd00226_e0EX_cpd00226_e0-220.8510.39%
cpd02701_c0SK_cpd02701_c0-0.3955150.06%
cpd03091_c0SK_cpd03091_c0-0.791100.07%
cpd11416_c0SK_cpd11416_c0-14100.00%
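For readers who want the same information programmatically rather than from the flattened HTML above, a minimal sketch using only the standard cobrapy API (no modelseedpy-specific calls assumed):

# List nonzero exchange fluxes after optimization -- the raw data behind
# the Uptake/Secretion tables (the sign convention depends on how each
# exchange reaction is written in this particular model).
solution = model.optimize()
for rxn in model.exchanges:
    flux = solution.fluxes[rxn.id]
    if abs(flux) > 1e-6:
        print(rxn.id, round(flux, 4))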
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Ignore this below ..." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from modelseedpy import RastClient\n", + "rast = RastClient()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Objective

1.0 bio1 = 141.02637369025626

Uptake

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetaboliteReactionFluxC-NumberC-Flux
cpd00007_e0EX_cpd00007_e0244.300.00%
cpd00024_e0EX_cpd00024_e083.0752.58%
cpd00028_e0EX_cpd00028_e00.3955340.08%
cpd00030_e0EX_cpd00030_e00.395500.00%
cpd00033_e0EX_cpd00033_e079.8120.99%
cpd00034_e0EX_cpd00034_e00.395500.00%
cpd00039_e0EX_cpd00039_e031.4261.17%
cpd00051_e0EX_cpd00051_e034.7461.29%
cpd00054_e0EX_cpd00054_e034.3530.64%
cpd00058_e0EX_cpd00058_e00.395500.00%
cpd00060_e0EX_cpd00060_e031.0950.96%
cpd00063_e0EX_cpd00063_e00.395500.00%
cpd00065_e0EX_cpd00065_e06.647110.45%
cpd00066_e0EX_cpd00066_e021.7691.21%
cpd00069_e0EX_cpd00069_e016.9990.95%
cpd00079_e0EX_cpd00079_e0499.9618.61%
cpd00080_e0EX_cpd00080_e0609.4311.34%
cpd00099_e0EX_cpd00099_e00.395500.00%
cpd00106_e0EX_cpd00106_e0401.249.96%
cpd00107_e0EX_cpd00107_e052.8661.97%
cpd00118_e0EX_cpd00118_e00.395540.01%
cpd00119_e0EX_cpd00119_e011.1660.42%
cpd00129_e0EX_cpd00129_e025.9650.81%
cpd00130_e0EX_cpd00130_e0199.144.94%
cpd00132_e0EX_cpd00132_e028.2840.70%
cpd00136_e0EX_cpd00136_e00.395570.02%
cpd00149_e0EX_cpd00149_e00.395500.00%
cpd00156_e0EX_cpd00156_e049.651.54%
cpd00161_e0EX_cpd00161_e029.7240.74%
cpd00184_e0EX_cpd00184_e0221.11013.71%
cpd00205_e0EX_cpd00205_e00.395500.00%
cpd00208_e0EX_cpd00208_e03.526120.26%
cpd00209_e0EX_cpd00209_e019000.00%
cpd00249_e0EX_cpd00249_e011.5690.65%
cpd00254_e0EX_cpd00254_e00.395500.00%
cpd00264_e0EX_cpd00264_e00.395570.02%
cpd00268_e0EX_cpd00268_e00.197800.00%
cpd00277_e0EX_cpd00277_e022.59101.40%
cpd00305_e0EX_cpd00305_e00.3955120.03%
cpd00322_e0EX_cpd00322_e034.0561.27%
cpd00355_e0EX_cpd00355_e00.791110.05%
cpd00367_e0EX_cpd00367_e012.9990.73%
cpd00383_e0EX_cpd00383_e01.97870.09%
cpd00412_e0EX_cpd00412_e02.76990.15%
cpd00438_e0EX_cpd00438_e02411014.95%
cpd00644_e0EX_cpd00644_e00.79190.04%
cpd00794_e0EX_cpd00794_e014.1121.05%
cpd01080_e0EX_cpd01080_e035.09183.92%
cpd03847_e0EX_cpd03847_e03.526140.31%
cpd10515_e0EX_cpd10515_e00.79100.00%
cpd10516_e0EX_cpd10516_e00.395500.00%
cpd17041_c0rxn13782_c014100.00%
cpd17042_c0rxn13783_c014100.00%
cpd17043_c0rxn13784_c014100.00%

Secretion

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetaboliteReactionFluxC-NumberC-Flux
cpd00009_e0EX_cpd00009_e0-100000.00%
cpd00011_e0EX_cpd00011_e0-796.817.50%
cpd00020_e0EX_cpd00020_e0-282.137.97%
cpd00027_e0EX_cpd00027_e0-445.8625.18%
cpd00029_e0EX_cpd00029_e0-49029.22%
cpd00035_e0EX_cpd00035_e0-185.235.23%
cpd00047_e0EX_cpd00047_e0-2.37310.02%
cpd00100_e0EX_cpd00100_e0-4.38630.12%
cpd00108_e0EX_cpd00108_e0-3.52660.20%
cpd00116_e0EX_cpd00116_e0-0.395510.00%
cpd00139_e0EX_cpd00139_e0-1.18720.02%
cpd00151_e0EX_cpd00151_e0-221.1510.40%
cpd00159_e0EX_cpd00159_e0-835.5323.60%
cpd00226_e0EX_cpd00226_e0-220.8510.39%
cpd02701_c0SK_cpd02701_c0-0.3955150.06%
cpd03091_c0SK_cpd03091_c0-0.791100.07%
cpd11416_c0SK_cpd11416_c0-14100.00%
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of features: 4285\n" + ] + } + ], + "source": [ + "print('Number of features:', len(genome.features))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "builder = MSBuilder(genome)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "IndexError", + "evalue": "The genomes or genomeSet that you have submitted wasn’t annotated using the RAST annotation pipeline. Please annotate the genomes via ‘Annotate Microbial Genome’ app (https://narrative.kbase.us/#appcatalog/app/RAST_SDK/reannotate_microbial_genome/release)or genomeSets via Annotate Multiple Microbial Genomes’ app (https://narrative.kbase.us/#appcatalog/app/RAST_SDK/reannotate_microbial_genomes/release) and resubmit the RAST annotated genome/genomeSets into the Predict Phenotype app. (", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/ml/predict_phenotype.py\u001b[0m in \u001b[0;36mcreate_indicator_matrix\u001b[0;34m(ref_to_role, master_role_list)\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 94\u001b[0;31m \u001b[0mindicators\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmatching_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 95\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: arrays used as indices must be of integer (or boolean) type", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_3016957/3197840996.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbuilder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_select_template\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/core/msbuilder.py\u001b[0m in \u001b[0;36mauto_select_template\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 664\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 665\u001b[0m \u001b[0mgenome_classifier\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_classifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"knn_ACNP_RAST_filter_01_17_2023\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 666\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgenome_class\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mgenome_classifier\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclassify\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgenome\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 667\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;31m# TODO: update with enum MSGenomeClass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/core/msgenomeclassifier.py\u001b[0m in \u001b[0;36mclassify\u001b[0;34m(self, genome_or_roles, ontology_term)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mgenome_or_roles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0montology_term\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m )\n\u001b[0;32m---> 33\u001b[0;31m indicator_df, master_role_list = create_indicator_matrix(\n\u001b[0m\u001b[1;32m 34\u001b[0m \u001b[0mgenome_or_roles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 35\u001b[0m )\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/modelseedpy/ml/predict_phenotype.py\u001b[0m in \u001b[0;36mcreate_indicator_matrix\u001b[0;34m(ref_to_role, master_role_list)\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mindicators\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmatching_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m raise IndexError(\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0;31m\"\u001b[0m\u001b[0mThe\u001b[0m \u001b[0mgenomes\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mgenomeSet\u001b[0m \u001b[0mthat\u001b[0m \u001b[0myou\u001b[0m \u001b[0mhave\u001b[0m \u001b[0msubmitted\u001b[0m \u001b[0mwasn\u001b[0m\u001b[0;31m’\u001b[0m\u001b[0mt\u001b[0m \u001b[0mannotated\u001b[0m \u001b[0musing\u001b[0m \u001b[0mthe\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0mRAST\u001b[0m \u001b[0mannotation\u001b[0m \u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mPlease\u001b[0m \u001b[0mannotate\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mgenomes\u001b[0m \u001b[0mvia\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m‘\u001b[0m\u001b[0mAnnotate\u001b[0m \u001b[0mMicrobial\u001b[0m \u001b[0mGenome\u001b[0m\u001b[0;31m’\u001b[0m \u001b[0mapp\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: The genomes or genomeSet that you have submitted wasn’t annotated using the RAST annotation pipeline. Please annotate the genomes via ‘Annotate Microbial Genome’ app (https://narrative.kbase.us/#appcatalog/app/RAST_SDK/reannotate_microbial_genome/release)or genomeSets via Annotate Multiple Microbial Genomes’ app (https://narrative.kbase.us/#appcatalog/app/RAST_SDK/reannotate_microbial_genomes/release) and resubmit the RAST annotated genome/genomeSets into the Predict Phenotype app. 
(" + ] + } + ], + "source": [ + "builder.auto_select_template()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from cobra.core import Reaction" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "rxn = Reaction('SK_cpd11416_c0', 'SK_cpd11416_c0', '', 0, 1000)\n", + "rxn.add_metabolites({model.metabolites.cpd11416_c0: -1})\n", + "model.add_reactions([rxn])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/fliu/.local/lib/python3.8/site-packages/cobra/io/dict.py:89: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " if isinstance(value, np.float):\n", + "/home/fliu/.local/lib/python3.8/site-packages/cobra/io/dict.py:91: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " if isinstance(value, np.bool):\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Name: ecoli
Memory address: 7f3dd51e8400
Number of metabolites: 1458
Number of reactions: 1772
Number of genes: 1295
Number of groups: 1323
Objective expression: 1.0*bio1 - 1.0*bio1_reverse_b18f7
Compartments: Cytosol, Extracellular
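For reference, an annotated restatement of the sink-reaction cell above. The positional arguments follow the standard cobra signature Reaction(id, name, subsystem, lower_bound, upper_bound); the biomass-drain rationale in the comment is our reading, not stated in the notebook.

from cobra.core import Reaction

# Export-only sink (bounds 0..1000) that consumes the biomass
# pseudo-metabolite, so bio1 can carry flux during gapfilling.
sink = Reaction("SK_cpd11416_c0", "SK_cpd11416_c0", "", 0, 1000)
sink.add_metabolites({model.metabolites.cpd11416_c0: -1})
model.add_reactions([sink])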
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "MSBuilder.gapfill_model(model, \"bio1\", builder.template, None)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Objective

1.0 bio1 = 0.0

Uptake

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetaboliteReactionFluxC-NumberC-Flux

Secretion

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MetaboliteReactionFluxC-NumberC-Flux
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cpd00010_c0 CoA [c0] 80\n", + "cpd11493_c0 ACP [c0] 39\n", + "cpd12370_c0 apo-ACP [c0] 3\n", + "cpd00003_c0 NAD [c0] 127\n", + "cpd00006_c0 NADP [c0] 89\n", + "cpd00205_c0 K+ [c0] 5\n", + "cpd00254_c0 Mg [c0] 3\n", + "cpd10516_c0 fe3 [c0] 5\n", + "cpd00063_c0 Ca2+ [c0] 2\n", + "cpd00009_c0 Phosphate [c0] 210\n", + "cpd00099_c0 Cl- [c0] 3\n", + "cpd00149_c0 Co2+ [c0] 2\n", + "cpd00058_c0 Cu2+ [c0] 3\n", + "cpd00015_c0 FAD [c0] 13\n", + "cpd10515_c0 Fe2+ [c0] 5\n", + "cpd00030_c0 Mn2+ [c0] 2\n", + "cpd00048_c0 Sulfate [c0] 4\n", + "cpd00034_c0 Zn2+ [c0] 2\n", + "cpd00016_c0 Pyridoxal phosphate [c0] 5\n", + "cpd00220_c0 Riboflavin [c0] 5\n", + "cpd00017_c0 S-Adenosyl-L-methionine [c0] 21\n", + "cpd00201_c0 10-Formyltetrahydrofolate [c0] 7\n", + "cpd00087_c0 Tetrahydrofolate [c0] 12\n", + "cpd00345_c0 5-Methyltetrahydrofolate [c0] 3\n", + "cpd00042_c0 GSH [c0] 13\n", + "cpd00028_c0 Heme [c0] 4\n", + "cpd00557_c0 Siroheme [c0] 2\n", + "cpd00264_c0 Spermidine [c0] 8\n", + "cpd00118_c0 Putrescine [c0] 9\n", + "cpd00056_c0 TPP [c0] 7\n", + "cpd15560_c0 Ubiquinone-8 [c0] 18\n", + "cpd15352_c0 2-Demethylmenaquinone 8 [c0] 7\n", + "cpd15500_c0 Menaquinone 8 [c0] 12\n", + "cpd00166_c0 Calomide [c0] 4\n", + "cpd01997_c0 Dimethylbenzimidazole [c0] 2\n", + "cpd03422_c0 Cobinamide [c0] 2\n", + "cpd00104_c0 BIOT [c0] 5\n", + "cpd00037_c0 UDP-N-acetylglucosamine [c0] 16\n", + "cpd00050_c0 FMN [c0] 11\n", + "cpd15793_c0 Stearoylcardiolipin (B. subtilis) [c0] 1\n", + "cpd15540_c0 Phosphatidylglycerol dioctadecanoyl [c0] 3\n", + "cpd15533_c0 phosphatidylethanolamine dioctadecanoyl [c0] 3\n", + "cpd15432_c0 core oligosaccharide lipid A [c0] 2\n", + "cpd02229_c0 Bactoprenyl diphosphate [c0] 5\n", + "cpd15665_c0 Peptidoglycan polymer (n subunits) [c0] 2\n", + "cpd15666_c0 Peptidoglycan polymer (n-1 subunits) [c0] 2\n", + "cpd00023_c0 L-Glutamate [c0] 57\n", + "cpd00001_c0 H2O [c0] 556\n", + "cpd00033_c0 Glycine [c0] 21\n", + "cpd00035_c0 L-Alanine [c0] 17\n", + "cpd00039_c0 L-Lysine [c0] 8\n", + "cpd00041_c0 L-Aspartate [c0] 19\n", + "cpd00051_c0 L-Arginine [c0] 6\n", + "cpd00053_c0 L-Glutamine [c0] 17\n", + "cpd00054_c0 L-Serine [c0] 23\n", + "cpd00060_c0 L-Methionine [c0] 19\n", + "cpd00065_c0 L-Tryptophan [c0] 5\n", + "cpd00066_c0 L-Phenylalanine [c0] 4\n", + "cpd00069_c0 L-Tyrosine [c0] 6\n", + "cpd00084_c0 L-Cysteine [c0] 14\n", + "cpd00107_c0 L-Leucine [c0] 6\n", + "cpd00119_c0 L-Histidine [c0] 4\n", + "cpd00129_c0 L-Proline [c0] 11\n", + "cpd00132_c0 L-Asparagine [c0] 6\n", + "cpd00156_c0 L-Valine [c0] 5\n", + "cpd00161_c0 L-Threonine [c0] 7\n", + "cpd00322_c0 L-Isoleucine [c0] 4\n", + "cpd00115_c0 dATP [c0] 7\n", + "cpd00012_c0 PPi [c0] 134\n", + "cpd00241_c0 dGTP [c0] 8\n", + "cpd00356_c0 dCTP [c0] 6\n", + "cpd00357_c0 TTP [c0] 7\n", + "cpd00002_c0 ATP [c0] 276\n", + "cpd00038_c0 GTP [c0] 20\n", + "cpd00052_c0 CTP [c0] 25\n", + "cpd00062_c0 UTP [c0] 13\n", + "cpd00008_c0 ADP [c0] 214\n", + "cpd00067_c0 H+ [c0] 896\n", + "cpd11416_c0 Biomass [c0] 2\n", + "cpd17041_c0 Protein biosynthesis [c0] 2\n", + "cpd17042_c0 DNA replication [c0] 2\n", + "cpd17043_c0 RNA transcription [c0] 2\n" + ] + } + ], + "source": [ + "for m in model.reactions.bio1.metabolites:\n", + " print(m, m.name, 
len(m.reactions))" ] }, { diff --git a/examples/Others/Biochem.ipynb b/examples/Others/Biochem.ipynb index 2433f4dd..00b845b8 100644 --- a/examples/Others/Biochem.ipynb +++ b/examples/Others/Biochem.ipynb @@ -4,18 +4,17 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cobrakbase 0.2.8\n" - ] - } - ], + "outputs": [], + "source": [ + "import modelseedpy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ - "import modelseedpy\n", - "import cobrakbase" + "### Load the database object from local github repository\n", + "https://github.com/ModelSEED/ModelSEEDDatabase" ] }, { @@ -24,336 +23,221 @@ "metadata": {}, "outputs": [], "source": [ - "modelseed = modelseedpy.biochem.from_local('../../../ModelSEEDDatabase')" + "database_path = '../../../ModelSEEDDatabase/'\n", + "modelseed = modelseedpy.biochem.from_local(database_path)" ] }, { - "cell_type": "code", - "execution_count": 3, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compounds" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "modelseedpy.biochem.modelseed_biochem.ModelSEEDBiochem" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "type(modelseed)" + "### Fetch compounds" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "{'C00001', 'C01328'}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Pyruvate\n", + "L-Lactate\n" + ] } ], "source": [ - "modelseed.compound_aliases['cpd00001']['KEGG']" + "cpd_pyruvate = modelseed.compounds.cpd00020\n", + "print(cpd_pyruvate.name)\n", + "cpd_lactate = modelseed.compounds.get_by_id('cpd00159')\n", + "print(cpd_lactate.name)" ] }, { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "kbase = cobrakbase.KBaseAPI()" + "### Read Aliases" ] }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "12218/444/1\n" + "Aliases dict_keys(['AlgaGEM', 'AraCyc', 'AraGEM', 'BiGG', 'BiGG1', 'BrachyCyc', 'ChlamyCyc', 'CornCyc', 'DF_Athaliana', 'EcoCyc', 'JM_Creinhardtii', 'JP_Creinhardtii_MSB', 'JP_Creinhardtii_NMeth', 'KEGG', 'MaizeCyc', 'Maize_C4GEM', 'MetaCyc', 'PlantCyc', 'PoplarCyc', 'RiceCyc', 'SorghumCyc', 'SoyCyc', 'TS_Athaliana', 'iAF1260', 'iAF692', 'iAG612', 'iAO358', 'iAbaylyiv4', 'iGT196', 'iIN800', 'iIT341', 'iJN746', 'iJR904', 'iMA945', 'iMEO21', 'iMM904', 'iMO1053-PAO1', 'iMO1056', 'iND750', 'iNJ661', 'iPS189', 'iRR1083', 'iRS1563', 'iRS1597', 'iSB619', 'iSO783', 'iYO844', 'metanetx.chemical', 'SMILE', 'InChIKey', 'InChI'])\n", + "KEGG {'C00022'}\n" ] } ], "source": [ - "template = kbase.get_from_ws('CoreBacteria_updated', 12218)\n", - "print(template.info)" + "print('Aliases', cpd_pyruvate.annotation.keys())\n", + "print('KEGG', cpd_pyruvate.annotation['KEGG'])" ] }, { - "cell_type": "code", - "execution_count": 103, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from cobrakbase.core.kbasefba.newmodeltemplate_metabolite import NewModelTemplateCompound\n", - "compounds = {}\n", - "for cc in template.compcompounds:\n", - " if cc.compound is None:\n", - " cpd = 
modelseed.get_seed_compound(cc.id[:-2])\n", - " if cpd.id not in compounds:\n", - " template_compound = NewModelTemplateCompound(cpd.id, cpd.formula, cpd.name)\n", - " compounds[template_compound.id] = NewModelTemplateCompound(cpd.id, cpd.formula, cpd.name)\n", - " print(cpd)" + "### Read Structures" ] }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 5, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 101, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "SMILES C[C@H](O)C(=O)[O-]\n", + "InChI InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1/t2-/m0/s1\n", + "InChI Key JVTAAEKCZFNVCJ-REOHCLBHSA-M\n" + ] } ], "source": [ - "kbase.save_object('CoreBacteria_updated', 12218, template.info.type, template)" + "print('SMILES', cpd_lactate.smiles)\n", + "print('InChI', cpd_lactate.inchi)\n", + "print('InChI Key', cpd_lactate.inchi_key)" ] }, { - "cell_type": "code", - "execution_count": 100, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "template.add_compounds(list(compounds.values()))" + "### Fetch by inchi key\n", + "`find_compounds_by_inchi_key(inchi_key, exact=True)` exact forces first and second key match `exact=False` searches by first inchi hash only" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 6, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
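Background for the InChIKey lookups in this notebook (a sketch of the key's structure, not a modelseedpy API): an InChIKey is three hyphen-separated blocks — a connectivity hash, a stereo/isotope hash, and a protonation character — which is why find_compounds_by_inchi_key(..., exact=False) can match on the first block alone and return all lactate variants.

# JVTAAEKCZFNVCJ-REOHCLBHSA-M is the L-lactate key printed above.
first, second, proton = "JVTAAEKCZFNVCJ-REOHCLBHSA-M".split("-")
# exact=True  matches first+second blocks (same skeleton and stereochemistry)
# exact=False matches the first block only (D-, L- and racemic lactate share it)
print(first, second, proton)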
Compound identifier: cpd26984
Name: DsrC-disulfide-form
Memory address: 0x07fc27bc0b710
Formula: C6H9N2O2R3S2
In 0 species
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 91, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "cpd00159 L-Lactate JVTAAEKCZFNVCJ-REOHCLBHSA-M\n" + ] } ], "source": [ - "\n", - "template_compound" + "for cpd in modelseed.find_compounds_by_inchi_key('JVTAAEKCZFNVCJ-REOHCLBHSA-M', True):\n", + " print(cpd, cpd.name, cpd.inchi_key)" ] }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 7, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Compound identifier: cpd26984
Name: DsrC-disulfide-form
Memory address: 0x07fc27bb9ea50
Formula: C6H9N2O2R3S2
In 0 species
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 90, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "cpd00221 D-Lactate JVTAAEKCZFNVCJ-UWTATZPHSA-M\n", + "cpd00159 L-Lactate JVTAAEKCZFNVCJ-REOHCLBHSA-M\n", + "cpd01022 Lactate JVTAAEKCZFNVCJ-UHFFFAOYSA-M\n" + ] } ], - "source": [] + "source": [ + "for cpd in modelseed.find_compounds_by_inchi_key('JVTAAEKCZFNVCJ-REOHCLBHSA-M', False):\n", + " print(cpd, cpd.name, cpd.inchi_key)" + ] }, { - "cell_type": "code", - "execution_count": 104, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "template_reaction = template.reactions.rxa45615_c" + "# Reactions" ] }, { - "cell_type": "code", - "execution_count": 17, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': 'rxa45615_c',\n", - " 'name': 'rxa45615_c',\n", - " 'GapfillDirection': '=',\n", - " 'base_cost': 1000,\n", - " 'reverse_penalty': 1000,\n", - " 'forward_penalty': 1000,\n", - " 'upper_bound': 1000,\n", - " 'lower_bound': -1000,\n", - " 'direction': '=',\n", - " 'maxforflux': 1000,\n", - " 'maxrevflux': 1000.0,\n", - " 'reaction_ref': 'kbase/default/reactions/id/rxa45615',\n", - " 'templateReactionReagents': [{'coefficient': -2,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd00067_c'},\n", - " {'coefficient': -3,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd00971_c'},\n", - " {'coefficient': -2,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd11620_c'},\n", - " {'coefficient': -1,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd08701_c'},\n", - " {'coefficient': 2,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd00067_e'},\n", - " {'coefficient': 3,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd00971_e'},\n", - " {'coefficient': 2,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd11621_c'},\n", - " {'coefficient': 1,\n", - " 'templatecompcompound_ref': '~/compcompounds/id/cpd08702_c'}],\n", - " 'templatecompartment_ref': '~/compartments/id/c',\n", - " 'templatecomplex_refs': [],\n", - " 'type': 'spontaneous'}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "template_reaction.get_data()" + "### Fetch Reactions" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 140472454487120 cpd00067_c cpd00067:H+ H+\n", - " 140472433862480 cpd00971_c cpd00971:Na+ Na+\n", - " 140472454486480 cpd11620_c cpd11620:Reducedferredoxin Reducedferredoxin\n", - " 140472433931728 cpd08701_c cpd08701:Methanophenazine Methanophenazine\n", - " 140472433821840 cpd00067_e cpd00067:H+ H+\n", - " 140472433861840 cpd00971_e cpd00971:Na+ Na+\n", - " 140472433893520 cpd11621_c cpd11621:Oxidizedferredoxin Oxidizedferredoxin\n", - " 140472433931856 cpd08702_c cpd08702:Dihydromethanophenazine Dihydromethanophenazine\n" + "rxn00148: cpd00002_0 + cpd00020_0 <=> cpd00008_0 + cpd00061_0 + cpd00067_0\n", + "ATP + Pyruvate <=> ADP + Phosphoenolpyruvate + H+\n" ] } ], "source": [ - "for o in template_reaction.metabolites:\n", - " print(type(o), id(o), o.id, o.compound, o.name)" + "reaction_PYK = modelseed.reactions.rxn00148\n", + "print(reaction_PYK)\n", + "print(reaction_PYK.build_reaction_string(True))" ] }, { - "cell_type": "code", - "execution_count": 65, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { 
- "text/plain": [ - "{'charge': 0,\n", - " 'id': 'cpd08701_c',\n", - " 'maxuptake': 0,\n", - " 'templatecompartment_ref': '~/compartments/id/c',\n", - " 'templatecompound_ref': '~/compounds/id/cpd08701'}" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "template.compcompounds.cpd08701_c.get_data()" + "### Read Aliases" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 58, + "execution_count": 9, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "140472724747984" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Aliases dict_keys(['AlgaGEM', 'AraCyc', 'AraGEM', 'BiGG', 'BrachyCyc', 'ChlamyCyc', 'CornCyc', 'DF_Athaliana', 'EcoCyc', 'JM_Creinhardtii', 'JP_Creinhardtii_MSB', 'JP_Creinhardtii_NMeth', 'KEGG', 'MaizeCyc', 'Maize_C4GEM', 'MetaCyc', 'PlantCyc', 'PoplarCyc', 'RiceCyc', 'SorghumCyc', 'SoyCyc', 'TS_Athaliana', 'iAF1260', 'iAF692', 'iAG612', 'iAO358', 'iGT196', 'iIN800', 'iJN746', 'iJR904', 'iMA945', 'iMEO21', 'iMM904', 'iMO1053-PAO1', 'iMO1056', 'iND750', 'iNJ661', 'iPS189', 'iRR1083', 'iRS1563', 'iRS1597', 'iSB619', 'iSO783', 'iYO844', 'metanetx.reaction', 'rhea', 'ec-code'])\n", + "KEGG {'R00200'}\n", + "ec-code {'2.7.1.40'}\n" + ] } ], "source": [ - "id(template.compcompounds.cpd08701_c.cpd08701_c)" + "print('Aliases', reaction_PYK.annotation.keys())\n", + "print('KEGG', reaction_PYK.annotation['KEGG'])\n", + "print('ec-code', reaction_PYK.annotation['ec-code'])" ] }, { - "cell_type": "code", - "execution_count": 61, + "cell_type": "markdown", "metadata": {}, + "source": [ + "### Instantiate reaction \n", + "Instantiate database reaction to a template reaction with cytosol `c` assigned to token `0`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -361,70 +245,47 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - "
(removed output — compound repr)
Metabolite identifier: cpd08701_c
Name:
Memory address: 0x07fc25936bb90
Formula:
Compartment: c
In 3 reaction(s): rxn03126_c, rxa45615_c, rxn15961_c

(added output — template reaction repr)
Reaction identifier: rxn00148_c
Name: ATP:pyruvate 2-O-phosphotransferase
Memory address: 0x7f5eb42f82e0
Stoichiometry: cpd00002_c + cpd00020_c <=> cpd00008_c + cpd00061_c + cpd00067_c
               (ATP + Pyruvate <=> ADP + Phosphoenolpyruvate + H+)
GPR:
Lower bound: -1000
Upper bound: 1000
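The token-to-compartment mapping generalizes to transport reactions, which span two compartment tokens; a hypothetical sketch (rxn45615, the Na+-translocating reaction shown near the end of this notebook, uses tokens 0 and 1):

# Map token 0 -> cytosol, token 1 -> extracellular (assumed mapping).
rxn_transport = modelseed.reactions.rxn45615
template_rxn = rxn_transport.to_template_reaction({0: "c", 1: "e"})
print(template_rxn.build_reaction_string(True))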
" + " \n", + " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 61, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "template.compcompounds.cpd08701_c" + "template_PYK_cytosol = reaction_PYK.to_template_reaction({0: 'c'})\n", + "template_PYK_cytosol" ] }, { - "cell_type": "code", - "execution_count": 134, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1) cpd00001[0] + (1) cpd00012[0] <=> (2) cpd00009[0] = >\n", - "(1) cpd00001[0] + (1) cpd00742[0] <=> (2) cpd00011[0] + (2) cpd00013[0] > >\n", - "(1) cpd00011[0] + (1) cpd00668[0] <=> (2) cpd00020[0] < <\n", - "(1) cpd02570[0] <=> (2) cpd00020[0] = =\n", - "(2) cpd00025[0] <=> (2) cpd00001[0] + (1) cpd00007[0] > >\n", - "(1) cpd00001[0] + (1) cpd00794[0] <=> (2) cpd00027[0] > =\n", - "(2) cpd00001[0] <=> (1) cpd00025[0] = <\n", - "(2) cpd00038[0] <=> (1) cpd00012[0] + (1) cpd00925[0] > =\n", - "(2) cpd00040[0] <=> (1) cpd00011[0] + (1) cpd00843[0] > =\n", - "(1) cpd00011[0] + (1) cpd03049[0] <=> (1) cpd00020[0] + (1) cpd00056[0] > <\n", - "(2) cpd00076[0] <=> (1) cpd00027[0] + (1) cpd02298[0] = =\n" - ] - } - ], "source": [ - "i =0 \n", - "for r in modelseed.reactions:\n", - " print(modelseed.reactions[r]['code'], modelseed.reactions[r]['direction'], modelseed.reactions[r]['reversibility'])\n", - " #print(modelseed.reactions[r]['code'])\n", - " #print(modelseed.reactions[r]['stoichiometry'])\n", - " #print(modelseed.reactions[r]['definition'])\n", - " \n", - " \n", - " i+= 1\n", - " if i > 10:\n", - " break" + "# Random debug stuff ignore for now" ] }, { @@ -532,83 +393,6 @@ "with open('/Users/fliu/workspace/jupyter/python3/annotation-server/data/extra_reactions.json', 'w') as fh:\n", " fh.write(json.dumps(extra_reactions))\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "modelseed.reactions.update" - ] - }, - { - "cell_type": "code", - "execution_count": 156, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "rxn45615: cpd00003 [0] + cpd00067 [0] + cpd00971 [0] + 2.0 cpd28082 [0] <=> cpd00004 [0] + cpd00971 [1] + 2.0 cpd27757 [0]\n" - ] - } - ], - "source": [ - "rxn = modelseed.get_seed_reaction('rxn45615')\n", - "print(type(rxn))\n", - "print(rxn)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': 'rxn45615',\n", - " 'abbreviation': nan,\n", - " 'name': nan,\n", - " 'code': '(1) cpd00003[0] + (1) cpd00971[0] + (2) cpd28082[0] <=> (1) cpd00004[0] + (1) cpd00971[1] + (2) cpd27757[0]',\n", - " 'stoichiometry': '-1:cpd00003:0:0:NAD;-1:cpd00067:0:0:H+;-1:cpd00971:0:0:Na+;-2:cpd28082:0:0:Reduced-ferredoxins;1:cpd00004:0:0:NADH;1:cpd00971:1:0:Na+;2:cpd27757:0:0:Oxidized-ferredoxins',\n", - " 'is_transport': 1,\n", - " 'equation': '(1) cpd00003[0] + (1) cpd00067[0] + (1) cpd00971[0] + (2) cpd28082[0] <=> (1) cpd00004[0] + (1) cpd00971[1] + (2) cpd27757[0]',\n", - " 'definition': '(1) NAD[0] + (1) H+[0] + (1) Na+[0] + (2) Reduced-ferredoxins[0] <=> (1) NADH[0] + (1) Na+[1] + (2) Oxidized-ferredoxins[0]',\n", - " 'reversibility': '?',\n", - " 'direction': '=',\n", - " 'abstract_reaction': nan,\n", - " 'pathways': nan,\n", - " 'aliases': 'MetaCyc: TRANS-RXN-276',\n", - " 'ec_numbers': '7.2.1.2',\n", - " 'deltag': 10000000.0,\n", - " 'deltagerr': 10000000.0,\n", - " 'compound_ids': 
'cpd00003;cpd00004;cpd00067;cpd00971;cpd27757;cpd28082',\n", - " 'status': 'OK',\n", - " 'is_obsolete': 0,\n", - " 'linked_reaction': nan,\n", - " 'notes': 'GCP|EQP',\n", - " 'source': 'Primary Database'}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "modelseed.reactions['rxn45615']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -627,7 +411,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.13" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/modelseedpy/__init__.py b/modelseedpy/__init__.py index 7f135055..7360617e 100644 --- a/modelseedpy/__init__.py +++ b/modelseedpy/__init__.py @@ -5,33 +5,20 @@ # set the warning format to be on a single line import sys import logging +import cobra import warnings as _warnings from os import name as _name from os.path import abspath as _abspath from os.path import dirname as _dirname from modelseedpy.helpers import config -logging_hash = { - "debug": logging.DEBUG, - "critical": logging.CRITICAL, - "error": logging.ERROR, - "warning": logging.WARNING, - "info": logging.INFO, -} +__author__ = "Christopher Henry" +__email__ = "chenry@anl.gov" +__version__ = "0.4.2" -# Configuing modelseedpy logger logger = logging.getLogger(__name__) -c_handler = logging.StreamHandler() -c_handler.setLevel(logging_hash[config.get("logging", "console_level")]) -c_format = logging.Formatter("%(name)s - %(levelname)s - %(message)s") -c_handler.setFormatter(c_format) -logger.addHandler(c_handler) -if config.get("logging", "log_file") == "yes": - f_handler = logging.FileHandler(config.get("logging", "filename"), mode="a") - f_handler.setLevel(logging_hash[config.get("logging", "file_level")]) - f_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") - f_handler.setFormatter(f_format) - logger.addHandler(f_handler) + +print("modelseedpy", __version__) if sys.version_info[0] == 2: logger.warning( @@ -41,6 +28,9 @@ "still work but we will no longer actively maintain Python 2 support." 
) +if "e0" not in cobra.medium.annotations.compartment_shortlist["e"]: + cobra.medium.annotations.compartment_shortlist["e"].append("e0") + import modelseedpy from modelseedpy.core import ( RastClient, @@ -48,16 +38,21 @@ MSBuilder, MSMedia, MSGrowthPhenotypes, + MSGrowthPhenotype, MSModelUtil, FBAHelper, MSEditorAPI, MSATPCorrection, MSGapfill, MSEquation, + MSModelReport, + AnnotationOntology, ) from modelseedpy.core.exceptions import * -from modelseedpy.community import MSCommunity, MSCompatibility, CommKineticPkg +#from modelseedpy.community import MSCommunity, MSCompatibility, CommKineticPkg + +from modelseedpy.biochem import ModelSEEDBiochem from modelseedpy.fbapkg import ( BaseFBAPkg, @@ -76,10 +71,11 @@ FullThermoPkg, MSPackageManager, ObjConstPkg, + ObjectivePkg, ChangeOptPkg, ElementUptakePkg, + ReactionActivationPkg, + ExpressionActivationPkg ) from modelseedpy.multiomics import MSExpression - -__version__ = "0.2.2" diff --git a/modelseedpy/biochem/modelseed_biochem.py b/modelseedpy/biochem/modelseed_biochem.py index 43cc865e..8dd9270a 100644 --- a/modelseedpy/biochem/modelseed_biochem.py +++ b/modelseedpy/biochem/modelseed_biochem.py @@ -1,12 +1,18 @@ # -*- coding: utf-8 -*- import logging +import os +import json import pandas as pd from cobra.core.dictlist import DictList from modelseedpy.biochem.modelseed_compound import ModelSEEDCompound, ModelSEEDCompound2 from modelseedpy.biochem.modelseed_reaction import ModelSEEDReaction, ModelSEEDReaction2 +from modelseedpy.helpers import config +from modelseedpy.core.msmodel import get_reaction_constraints_from_direction logger = logging.getLogger(__name__) +_BIOCHEM_FOLDER = "Biochemistry" + ALIAS_CPD_IDENTIFIERS_ORG = { "BiGG": "bigg.metabolite", "KEGG": "kegg.compound", @@ -54,6 +60,29 @@ "TS_Athaliana", } +def convert_to_searchname(name): + OriginalName = name + ending = ""; + if name[-1] == "-": + ending = "-" + name = name.lower() + name.replace(" ","") + name.replace(",","") + name.replace("-","") + name.replace("_","") + name.replace("(","") + name.replace(")","") + name.replace("}","") + name.replace("{","") + name.replace("[","") + name.replace("]","") + name.replace(":","") + name.replace("�","") + name.replace("'","") + name.replace("_","") + name += ending + name.replace("icacid","ate") + return name; def get_low(ids): low = None @@ -134,10 +163,13 @@ def load_metabolites_from_df( if cpd_id in structures: if "SMILE" in structures[cpd_id]: smiles = structures[cpd_id]["SMILE"] + aliases_annotation["SMILE"] = smiles if "InChI" in structures[cpd_id]: inchi = structures[cpd_id]["InChI"] + aliases_annotation["InChI"] = inchi if "InChIKey" in structures[cpd_id]: inchi_key = structures[cpd_id]["InChIKey"] + aliases_annotation["InChIKey"] = inchi_key inchi_key = None if pd.isna(inchi_key) or len(inchi_key) == 0 else inchi_key other_names = set() if cpd_id in names: @@ -153,9 +185,6 @@ def load_metabolites_from_df( mass, delta_g, delta_g_err, - smiles, - inchi_key, - inchi, is_core, is_obsolete, is_cofactor, @@ -174,6 +203,201 @@ def load_metabolites_from_df( return compounds +def _load_aliases_df(df_aliases, seed_index=1, source_index=3, alias_id_index=2): + aliases = {} + for i in df_aliases.itertuples(): + seed_id = i[seed_index] + alias_id = i[alias_id_index] + source = i[source_index] + if seed_id not in aliases: + aliases[seed_id] = {} + if source not in aliases[seed_id]: + aliases[seed_id][source] = set() + aliases[seed_id][source].add(alias_id) + return aliases + + +def _load_metabolites( + database_path: str, aliases=None, 
names=None, structures=None +) -> dict: + if aliases is None: + aliases = {} + if names is None: + names = {} + if structures is None: + structures = {} + metabolites = {} + contents = os.listdir(f"{database_path}/{_BIOCHEM_FOLDER}") + for f in contents: + if f.startswith("compound_") and f.endswith(".json"): + with open(f"{database_path}/{_BIOCHEM_FOLDER}/{f}", "r") as fh: + _compounds_data = json.load(fh) + for o in _compounds_data: + if "id" in o and o["id"]: + cpd_names = set() + if o["id"] in names: + cpd_names |= names[o["id"]] + cpd = ModelSEEDCompound2( + o["id"], + o.get("formula"), + o.get("name"), + o.get("charge"), + "", + o.get("abbreviation"), + cpd_names, + o.get("mass"), + o.get("deltag"), + o.get("deltagerr"), + o.get("is_core"), + o.get("is_obsolete"), + None, + o.get("pka"), + o.get("pkb"), + o.get("source"), + ) + if cpd.id in aliases: + cpd.annotation.update(aliases[cpd.id]) + if cpd.id in structures: + for alias_type in structures[cpd.id]: + v = structures[cpd.id][alias_type] + if len(v) == 1: + cpd.annotation[alias_type] = list(v)[0] + else: + logger.warning( + f"multiple {alias_type} structures found for {cpd.id}" + ) + metabolites[cpd.id] = cpd + else: + print("error", o) + # print(_compounds_data[0].keys()) + return metabolites + + +def build_modelseed_reaction( + o, names, aliases, ec_numbers, metabolites_indexed, metabolites +): + if "id" in o and o["id"]: + rxn_names = set() + if o["id"] in names: + rxn_names |= names[o["id"]] + ( + lower_bound, + upper_bound, + ) = get_reaction_constraints_from_direction(o.get("reversibility")) + stoichiometry = o.get("stoichiometry") + reaction_metabolites = {} + for s in stoichiometry: + cmp_token = s["compartment"] + value = s["coefficient"] + cpd = metabolites[s["compound"]] + cpd_index_id = f"{cpd.id}_{cmp_token}" + if cpd_index_id not in metabolites_indexed: + cpd_token = cpd.copy() + cpd_token.id = f"{cpd.id}_{cmp_token}" + cpd_token.base_id = cpd.id + cpd_token.compartment = cmp_token + metabolites_indexed[cpd_index_id] = cpd_token + reaction_metabolites[metabolites_indexed[cpd_index_id]] = value + rxn = ModelSEEDReaction2( + o["id"], + o.get("name"), + "", + lower_bound, + upper_bound, + "", + rxn_names, + o.get("deltag"), + o.get("deltagerr"), + o.get("is_obsolete"), + None, + o.get("status"), + o.get("source"), + ) + rxn.add_metabolites(reaction_metabolites) + if rxn.id in aliases: + rxn.annotation.update(aliases[rxn.id]) + if rxn.id in ec_numbers: + rxn.annotation["ec-code"] = ec_numbers[rxn.id] + return rxn + else: + raise ValueError("unable to build reaction") + + +def _load_reactions( + database_path: str, metabolites: dict, aliases=None, names=None, ec_numbers=None +) -> (dict, dict): + if aliases is None: + aliases = {} + if names is None: + names = {} + if ec_numbers is None: + ec_numbers = {} + reactions = {} + contents = os.listdir(f"{database_path}/{_BIOCHEM_FOLDER}") + metabolites_indexed = {} + for f in contents: + if f.startswith("reaction_") and f.endswith(".json"): + with open(f"{database_path}/{_BIOCHEM_FOLDER}/{f}", "r") as fh: + _reactions_data = json.load(fh) + for o in _reactions_data: + if "id" in o and o["id"]: + rxn_names = set() + if o["id"] in names: + rxn_names |= names[o["id"]] + ( + lower_bound, + upper_bound, + ) = get_reaction_constraints_from_direction( + o.get("reversibility") + ) + stoichiometry = o.get("stoichiometry") + reaction_metabolites = {} + for s in stoichiometry: + cmp_token = s["compartment"] + value = s["coefficient"] + cpd = metabolites[s["compound"]] + cpd_index_id = 
f"{cpd.id}_{cmp_token}" + if cpd_index_id not in metabolites_indexed: + cpd_token = cpd.copy() + cpd_token.id = f"{cpd.id}_{cmp_token}" + cpd_token.base_id = cpd.id + cpd_token.compartment = cmp_token + metabolites_indexed[cpd_index_id] = cpd_token + reaction_metabolites[ + metabolites_indexed[cpd_index_id] + ] = value + rxn = ModelSEEDReaction2( + o["id"], + o.get("name"), + "", + lower_bound, + upper_bound, + "", + rxn_names, + o.get("deltag"), + o.get("deltagerr"), + o.get("is_obsolete"), + None, + o.get("status"), + o.get("source"), + pathways=o.get("pathways"), + ) + if "linked_reaction" in o and o.get("linked_reaction"): + ids = o.get("linked_reaction").split(";") + rxn.annotation["modelseed"] = ids[0] + rxn.add_metabolites(reaction_metabolites) + if rxn.id in aliases: + rxn.annotation.update(aliases[rxn.id]) + if rxn.id in ec_numbers: + rxn.annotation["ec-code"] = ec_numbers[rxn.id] + metabolites[cpd.id] = cpd + reactions[rxn.id] = rxn + else: + logger.error(f"failed to read reaction record {o}") + + return reactions, metabolites_indexed + + def load_reactions_from_df( df: pd.DataFrame, database_metabolites: dict, @@ -271,16 +495,31 @@ class ModelSEEDDatabase: ModelSEED database instance. """ - def __init__(self, compounds, reactions, compound_tokens): + def __init__(self, compounds: list, reactions: list, compound_tokens: list): self.compounds = DictList() self.compound_tokens = DictList() self.reactions = DictList() self.compounds += compounds self.reactions += reactions - self.reactions += compound_tokens + self.compound_tokens += compound_tokens + self.inchi_key_lookup = {} self.metabolite_reactions = {} + self._index_inchi() + + def _index_inchi(self): + for m in self.compounds: + if m.inchi_key: + f, s, p = m.inchi_key.split("-") + if f not in self.inchi_key_lookup: + self.inchi_key_lookup[f] = {} + if s not in self.inchi_key_lookup[f]: + self.inchi_key_lookup[f][s] = set() + proton_pair = (m.id, p) + if proton_pair not in self.inchi_key_lookup[f][s]: + self.inchi_key_lookup[f][s].add(proton_pair) + def compounds_by_alias(self, alias, value): pass @@ -288,9 +527,27 @@ def reactions_by_alias(self, alias, value): pass def find_compounds_by_inchi_key(self, inchi_key, exact=True): - pass + f, s, p = inchi_key.split("-") + if exact and f in self.inchi_key_lookup and s in self.inchi_key_lookup[f]: + # x is tuple (cpd.id, protonation) + return [self.compounds.get_by_id(x[0]) for x in self.inchi_key_lookup[f][s]] + elif f in self.inchi_key_lookup and not exact: + seed_ids = set() + for s in self.inchi_key_lookup[f]: + # x is tuple (cpd.id, protonation) + seed_ids |= {x[0] for x in self.inchi_key_lookup[f][s]} + + return [self.compounds.get_by_id(seed_id) for seed_id in seed_ids] + else: + return [] + + def find_reactions_by_compounds(self, compounds, or_instead_of_and=False): + """ - def find_reactions_by_compounds(self, compounds): + @param compounds: list of seed compound ids + @param or_instead_of_and: use OR logic instead of AND (default) + @return: + """ pass def add_compound(self, cpd): @@ -310,6 +567,15 @@ def add_reaction(self, rxn): class ModelSEEDBiochem: + default_biochemistry = None + + @staticmethod + def get(create_if_missing=True,path=config.get("biochem", "path")): + if not ModelSEEDBiochem.default_biochemistry: + print("loading biochemistry database from", path) + ModelSEEDBiochem.default_biochemistry = from_local(path) + return ModelSEEDBiochem.default_biochemistry + def __init__( self, compounds, @@ -549,7 +815,7 @@ def load_database( return database -def 
from_local(path): +def from_local_old(path): database_repo = path reactions_url = database_repo + "/Biochemistry/reactions.tsv" compounds_url = database_repo + "/Biochemistry/compounds.tsv" @@ -617,6 +883,75 @@ def from_local(path): return modelseed +def from_local(database_path: str): + contents = os.listdir(f"{database_path}/Biochemistry/") + if "compounds.tsv" in contents: + return from_local_old(database_path) + + compound_aliases_url = ( + f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Compound_Aliases.txt" + ) + reaction_aliases_url = ( + f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_Aliases.txt" + ) + compound_aliases = _load_aliases_df( + pd.read_csv(compound_aliases_url, index_col=None, sep="\t") + ) + reaction_aliases = _load_aliases_df( + pd.read_csv(reaction_aliases_url, index_col=None, sep="\t") + ) + + compound_structures_url = ( + f"{database_path}/Biochemistry/Structures/Unique_ModelSEED_Structures.txt" + ) + compound_structures = _load_aliases_df( + pd.read_csv(compound_structures_url, index_col=None, sep="\t"), + source_index=2, + alias_id_index=6, + ) + + compound_names_url = ( + f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Compound_Names.txt" + ) + reaction_names_url = ( + f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_Names.txt" + ) + compound_names = _load_aliases_df( + pd.read_csv(compound_names_url, index_col=None, sep="\t") + ) + reaction_names = _load_aliases_df( + pd.read_csv(reaction_names_url, index_col=None, sep="\t") + ) + + reaction_ecs_url = ( + f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_ECs.txt" + ) + reaction_ecs = _load_aliases_df( + pd.read_csv(reaction_ecs_url, index_col=None, sep="\t") + ) + + # build metabolites unpack names + metabolites = _load_metabolites( + database_path, + compound_aliases, + {k: v["name"] for k, v in compound_names.items()}, + compound_structures, + ) + + # build reactions unpack names, ecs + reactions, metabolite_tokens = _load_reactions( + database_path, + metabolites, + reaction_aliases, + {k: v["name"] for k, v in reaction_names.items()}, + {k: v["Enzyme Class"] for k, v in reaction_ecs.items()}, + ) + database = ModelSEEDDatabase( + metabolites.values(), reactions.values(), metabolite_tokens.values() + ) + return database + + def get_names_from_df(df): names = {} for t in df.itertuples(): diff --git a/modelseedpy/biochem/modelseed_compound.py b/modelseedpy/biochem/modelseed_compound.py index fdb54065..c5c73fed 100644 --- a/modelseedpy/biochem/modelseed_compound.py +++ b/modelseedpy/biochem/modelseed_compound.py @@ -1,9 +1,13 @@ # -*- coding: utf-8 -*- from modelseedpy.biochem.seed_object import ModelSEEDObject -from modelseedpy.core.mstemplate import MSTemplateSpecies +from modelseedpy.core.mstemplate import MSTemplateSpecies, MSTemplateMetabolite from cobra.core import Metabolite import pandas as pd +_SMILE_ALIAS = "SMILE" +_INCHI_ALIAS = "InChI" +_INCHI_KEY_ALIAS = "InChIKey" + class ModelSEEDCompound2(Metabolite): def __init__( @@ -18,9 +22,6 @@ def __init__( mass=None, delta_g=None, delta_g_error=None, - smiles=None, - inchi_key=None, - inchi=None, is_core=False, is_obsolete=False, is_cofactor=False, @@ -48,10 +49,6 @@ def __init__( self.delta_g = delta_g self.delta_g_error = delta_g_error - self.smiles = smiles - self.inchi_key = inchi_key - self.inchi = inchi - self.linked_compound = None self.pka = pka self.pkb = pkb @@ -60,11 +57,55 @@ def __init__( self.flags |= set(flags) def to_template_compartment_compound(self, compartment): - res = 
self.copy() - res.id = f"{self.seed_id}_{compartment}" - res.compartment = compartment + cpd_id = f"{self.seed_id}" + if compartment: + cpd_id += f"_{compartment}" + # build Template Compound + metabolite = MSTemplateMetabolite( + self.seed_id, + self.formula, + self.name, + self.charge, + self.mass, + self.delta_g, + self.delta_g_error, + self.is_cofactor, + self.abbr, + ) + # build Template Compartment Compound + if compartment is None: + compartment = "x" + res = MSTemplateSpecies(cpd_id, self.charge, compartment, metabolite.id) + + # assign Compound to Compartment Compound + res._template_compound = metabolite + res.annotation.update(self.annotation) return res + @property + def smiles(self): + return ( + None + if _SMILE_ALIAS not in self.annotation + else self.annotation[_SMILE_ALIAS] + ) + + @property + def inchi_key(self): + return ( + None + if _INCHI_KEY_ALIAS not in self.annotation + else self.annotation[_INCHI_KEY_ALIAS] + ) + + @property + def inchi(self): + return ( + None + if _INCHI_ALIAS not in self.annotation + else self.annotation[_INCHI_ALIAS] + ) + class ModelSEEDCompound(ModelSEEDObject): @property diff --git a/modelseedpy/biochem/modelseed_reaction.py b/modelseedpy/biochem/modelseed_reaction.py index 5d19a1b4..4c94c079 100644 --- a/modelseedpy/biochem/modelseed_reaction.py +++ b/modelseedpy/biochem/modelseed_reaction.py @@ -2,6 +2,7 @@ import math from modelseedpy.biochem.seed_object import ModelSEEDObject from cobra.core import Reaction +from modelseedpy.core.mstemplate import MSTemplateReaction def to_str2(rxn, cmp_replace=None, cpd_replace={}): @@ -133,6 +134,7 @@ def __init__( status=None, source=None, flags=None, + pathways=None, ): super().__init__(rxn_id, name, subsystem, lower_bound, upper_bound) @@ -145,26 +147,41 @@ def __init__( self.status = status self.is_obsolete = is_obsolete + if self.is_obsolete: + self.is_obsolete = True + else: + self.is_obsolete = False self.is_abstract = is_abstract - self.delta_g = delta_g - self.delta_g_error = delta_g_error + self.delta_g = float(delta_g) if delta_g is not None else None + self.delta_g_error = float(delta_g_error) if delta_g_error is not None else None + + # removing symbolic high values representing null/none + if self.delta_g is not None and self.delta_g > 10000: + self.delta_g = None + if self.delta_g_error is not None and self.delta_g_error > 10000: + self.delta_g_error = None self.flags = set() if flags: self.flags |= set(flags) + self.pathways = pathways + @property def compound_ids(self): - pass + return None def to_template_reaction(self, compartment_setup=None): if compartment_setup is None: raise ValueError("invalid compartment setup") from modelseedpy.core.msmodel import get_cmp_token + rxn_id = f"{self.id}" reaction_compartment = get_cmp_token(compartment_setup.values()) - rxn_id = f"{self.id}_{reaction_compartment}" + if reaction_compartment: + rxn_id += f"_{reaction_compartment}" + name = f"{self.name}" metabolites = {} for m, v in self.metabolites.items(): @@ -178,10 +195,11 @@ def to_template_reaction(self, compartment_setup=None): # if len(str(index)) > 0: # name = f'{self.name} [{compartment}]' - reaction = Reaction( - rxn_id, name, self.subsystem, self.lower_bound, self.upper_bound + reaction = MSTemplateReaction( + rxn_id, self.id, name, self.subsystem, self.lower_bound, self.upper_bound ) reaction.add_metabolites(metabolites) + reaction.annotation.update(self.annotation) return reaction @property diff --git a/modelseedpy/community/__init__.py b/modelseedpy/community/__init__.py index 
ebf07888..1182aa6c 100644
--- a/modelseedpy/community/__init__.py
+++ b/modelseedpy/community/__init__.py
@@ -4,7 +4,11 @@
 # import pyximport; pyximport.install(language_level=3) # improve computational speed
-from modelseedpy.community.mscommunity import *
-from modelseedpy.community.dfbapkg import dFBAPkg
-from modelseedpy.community.mscompatibility import MSCompatibility
+#from modelseedpy.community.mscommunity import *
+#from modelseedpy.community.datastandardization import *
 from modelseedpy.community.commkineticpkg import CommKineticPkg
+from modelseedpy.community.mscompatibility import MSCompatibility
+from modelseedpy.community.mssteadycom import MSSteadyCom
+from modelseedpy.community.commphitting import CommPhitting
+from modelseedpy.community.commhelper import build_from_species_models, phenotypes
+from modelseedpy.community.mskineticsfba import MSKineticsFBA
diff --git a/modelseedpy/community/commhelper.py b/modelseedpy/community/commhelper.py
new file mode 100644
index 00000000..b17f8867
--- /dev/null
+++ b/modelseedpy/community/commhelper.py
@@ -0,0 +1,530 @@
+from modelseedpy.core.msminimalmedia import minimizeFlux_withGrowth, bioFlux_check
+from modelseedpy.core.exceptions import NoFluxError, ObjectiveError
+from modelseedpy.fbapkg.mspackagemanager import MSPackageManager
+from modelseedpy.core.msmodelutl import MSModelUtil
+from modelseedpy.core.fbahelper import FBAHelper
+from cobra import Model, Reaction, Metabolite
+from cobra.medium import minimal_medium
+
+# from commscores import GEMCompatibility
+from cobra.flux_analysis import pfba
+from collections import OrderedDict
+from optlang.symbolics import Zero
+from optlang import Constraint
+from math import inf, isclose
+from pandas import DataFrame
+from pprint import pprint
+from numpy import mean
+import re
+
+
+def strip_comp(ID):
+    ID = ID.replace("-", "~")
+    return re.sub(r"(\_\w\d)", "", ID)
+
+
+def export_lp(model, name):
+    with open(f"{name}.lp", "w") as out:
+        out.write(model.solver.to_lp())
+
+
+def correct_nonMSID(nonMSobject, output, model_index):
+    name, compartment = output
+    index = 0 if compartment == "e" else model_index
+    nonMSobject.compartment = compartment + str(index)
+    comp = re.search(r"(_[a-z]\d+$)", nonMSobject.id)
+    if comp is None and rf"[{compartment}]" in nonMSobject.id:
+        return nonMSobject.id.replace(
+            rf"[{compartment}]", f"_{nonMSobject.compartment}"
+        )
+    elif comp is None:
+        return nonMSobject.id + f"_{nonMSobject.compartment}"
+    return "_".join([nonMSobject.id.replace(comp.group(), ""), nonMSobject.compartment])
+
+
+def build_from_species_models(
+    org_models,
+    model_id=None,
+    name=None,
+    abundances=None,
+    standardize=False,
+    MSmodel=True,
+    commkinetics=True,
+    copy_models=True,
+    printing=False,
+):
+    """Merges the input list of single-species metabolic models into a community metabolic model.
+
+    Parameters
+    ----------
+    org_models : list of cobra models to be merged into a community model
+    model_id : string specifying the community model ID
+    name : string specifying the community model name
+    abundances : dict of relative abundances for the input models in the community model
+    standardize : bool for whether the exchanges of each member model will be standardized (True) or just aligned.
+    MSmodel : bool for whether the member models follow ModelSEED ID conventions
+    commkinetics : bool for whether commkinetics constraints proportionally limit member fluxes to their abundances
+    copy_models : bool for whether the input models are copied before their IDs are edited
+    printing : bool for whether renaming diagnostics are printed
+ + Returns + ------- + Cobra.Model for the desired Community + + Raises + ------ + """ + # construct the new model + models = org_models # if not standardize else GEMCompatibility.standardize( + # org_models, exchanges=True, conflicts_file_name='exchanges_conflicts.json') + biomass_indices = [] + biomass_index = minimal_biomass_index = 2 + new_metabolites, new_reactions = set(), set() + member_biomasses = {} + for model_index, org_model in enumerate(models): + model_util = MSModelUtil(org_model, copy=copy_models) + model_reaction_ids = [rxn.id for rxn in model_util.model.reactions] + model_index += 1 + # if MSmodel: + # Rename metabolites + for met in model_util.model.metabolites: + # Renaming compartments + output = MSModelUtil.parse_id(met) + if printing: + print(met, output) + if output is None: + if printing: + print( + f"The {met.id} ({output}; {hasattr(met, 'compartment')}) is unpredictable." + ) + met.id = correct_nonMSID(met, (met.id, "c"), model_index) + elif len(output) == 2: + met.id = correct_nonMSID(met, output, model_index) + elif len(output) == 3: + name, compartment, out_index = output + index = 0 if compartment == "e" else model_index + if out_index == "": + met.id += str(index) + met.compartment += str(index) + elif compartment == "e": + met.compartment = "e0" + else: + met.compartment = compartment + str(index) + met.id = name + "_" + met.compartment + new_metabolites.add(met) + if "cpd11416_c" in met.id or "biomass" in met.id: + member_biomasses[org_model.id] = met + # Rename reactions + for ( + rxn + ) in ( + model_util.model.reactions + ): # !!! all reactions should have a non-zero compartment index + if rxn.id[0:3] != "EX_": + ## biomass reactions + if re.search("^(bio)(\d+)$", rxn.id): + index = int(re.sub(r"(^bio)", "", rxn.id)) + if biomass_index == 2: + while f"bio{biomass_index}" in model_reaction_ids: + biomass_index += 1 + if index not in biomass_indices and index >= minimal_biomass_index: + biomass_indices.append(index) + else: # biomass indices can be decoupled from the respective reaction indices of the same model + rxn.id = "bio" + str(biomass_index) + if rxn.id not in model_reaction_ids: + biomass_indices.append(biomass_index) + else: + index = minimal_biomass_index + rxn.id = "bio" + str(index) + while ( + rxn.id not in model_reaction_ids + and index not in biomass_indices + ): + index += 1 + rxn.id = "bio" + str(index) + biomass_indices.append(index) + biomass_index += 1 + ## non-biomass reactions + else: + initialID = str(rxn.id) + output = MSModelUtil.parse_id(rxn) + if output is None: + if printing: + print( + f"The {rxn.id} ({output}; {hasattr(rxn, 'compartment')}) is unpredictable." 
+ ) + try: + rxn.id = correct_nonMSID(rxn, (rxn.id, "c"), model_index) + except ValueError: + pass + elif len(output) == 2: + rxn.id = correct_nonMSID(rxn, output, model_index) + elif len(output) == 3: + name, compartment, index = output + if compartment != "e": + rxn.name = f"{name}_{compartment}{model_index}" + rxn_id = re.search(r"(.+\_\w)(?=\d+)", rxn.id).group() + if index == "": + rxn.id += str(model_index) + else: + rxn.id = rxn_id + str(model_index) + finalID = str(rxn.id) + string_diff = "" + for index, let in enumerate(finalID): + if ( + index >= len(initialID) + or index < len(initialID) + and let != initialID[index] + ): + string_diff += let + if string_diff != f"_{compartment}{model_index}" and printing: + print( + f"The ID {initialID} is changed with {string_diff} to create the final ID {finalID}" + ) + new_reactions.add(rxn) + # else: + # # TODO develop a method for compartmentalizing models without editing all reaction IDs or assuming their syntax + # pass + # adds only unique reactions and metabolites to the community model + newmodel = Model( + model_id or "+".join([model.id for model in models]), + name or "+".join([model.name for model in models]), + ) + newmodel.add_reactions(FBAHelper.filter_cobra_set(new_reactions)) + newmodel.add_metabolites(FBAHelper.filter_cobra_set(new_metabolites)) + + # Create community biomass + comm_biomass = Metabolite("cpd11416_c0", None, "Community biomass", 0, "c0") + metabolites = {comm_biomass: 1} + ## constrain the community abundances + if abundances: + abundances = { + met: abundances[memberID] for memberID, met in member_biomasses.items() + } + else: + abundances = { + cpd: -1 / len(member_biomasses) for cpd in member_biomasses.values() + } + ## define community biomass components + metabolites.update(abundances) + comm_biorxn = Reaction(id="bio1", name="bio1", lower_bound=0, upper_bound=1000) + comm_biorxn.add_metabolites(metabolites) + newmodel.add_reactions([comm_biorxn]) + # update model components + newutl = MSModelUtil(newmodel) + newutl.add_objective(comm_biorxn.flux_expression) + newutl.model.add_boundary( + comm_biomass, "sink" + ) # Is a sink reaction for reversible cpd11416_c0 consumption necessary? 
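
For reviewers, the community-biomass wiring that the preceding hunk implements can be summarized in isolation: each member's biomass metabolite is drained in proportion to its relative abundance to produce one unit of community biomass, which is then given a sink. The sketch below is a minimal reconstruction with plain COBRApy objects, assuming the cpd11416_c0/bio1 conventions of this patch; the helper name and its interface are illustrative, not part of the codebase.

```python
from cobra import Model, Metabolite, Reaction


def community_biomass_sketch(member_biomass_mets, abundances):
    """Assemble only the community-biomass portion of a merged model.

    member_biomass_mets: {member_id: cobra.Metabolite} for each member biomass
    abundances: {member_id: relative abundance}, expected to sum to 1
    """
    model = Model("community")
    comm_biomass = Metabolite(
        "cpd11416_c0", name="Community biomass", compartment="c0"
    )
    # each member biomass is consumed in proportion to its abundance,
    # yielding one unit of community biomass
    stoichiometry = {
        met: -abundances[member_id]
        for member_id, met in member_biomass_mets.items()
    }
    stoichiometry[comm_biomass] = 1
    comm_biorxn = Reaction("bio1", name="bio1", lower_bound=0, upper_bound=1000)
    comm_biorxn.add_metabolites(stoichiometry)
    model.add_reactions([comm_biorxn])
    # the sink permits accumulation of the community biomass metabolite
    model.add_boundary(comm_biomass, type="sink")
    model.objective = comm_biorxn
    return model


# hypothetical two-member usage:
# bio_a = Metabolite("cpd11416_c1", compartment="c1")
# bio_b = Metabolite("cpd11416_c2", compartment="c2")
# community = community_biomass_sketch(
#     {"memberA": bio_a, "memberB": bio_b}, {"memberA": 0.5, "memberB": 0.5}
# )
```

With equal abundances this reproduces the -1 / len(member_biomasses) coefficients assigned above.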
+ ## proportionally limit the fluxes to their abundances + if commkinetics: + add_commkinetics(newutl, models, member_biomasses, abundances) + # add the metadata of community composition + if hasattr(newutl.model, "_context"): + newutl.model._contents.append(member_biomasses) + elif hasattr(newutl.model, "notes"): + newutl.model.notes.update(member_biomasses) + # print([cons.name for cons in newutl.model.constraints]) + return newutl.model + + +def add_commkinetics(util, models, member_biomasses, abundances): + # TODO this creates an error with the member biomass reactions not being identified in the model + coef = {} + for model in models: + coef[member_biomasses[model.id]] = -abundances[member_biomasses[model.id]] + for rxn in model.reactions: + if rxn.id[:3] == "rxn": + coef[rxn.forward_variable] = coef[rxn.reverse_variable] = 1 + util.create_constraint( + Constraint(Zero, name="member_flux_limit"), coef=coef, printing=True + ) + + +def phenotypes(community_members, phenotype_flux_threshold=0.1, solver: str = "glpk"): + # log information of each respective model + models = OrderedDict() + solutions = [] + media_conc = set() + # calculate all phenotype profiles for all members + comm_members = community_members.copy() + # print(community_members) + for ( + org_model, + content, + ) in ( + community_members.items() + ): # community_members excludes the stationary phenotype + print("\n", org_model.id) + org_model.solver = solver + all_phenotypes = "phenotypes" not in content + model_util = MSModelUtil(org_model, True) + if "org_coef" not in locals(): + org_coef = { + model_util.model.reactions.get_by_id( + "EX_cpd00007_e0" + ).reverse_variable: -1 + } + model_util.standard_exchanges() + models[org_model.id] = { + "exchanges": model_util.exchange_list(), + "solutions": {}, + "name": content["name"], + } + phenotypes = ( + { + met.name: {"consumed": met.id.replace("EX_", "").replace("_e0", "")} + for met in model_util.carbon_exchange_mets_list(include_unknown=False) + } + if all_phenotypes + else content["phenotypes"] + ) + # print(phenotypes) + models[org_model.id]["phenotypes"] = ["stationary"] + [ + content["phenotypes"].keys() for member, content in comm_members.items() + ] + phenoRXNs = [ + pheno_cpd + for pheno, pheno_cpds in content["phenotypes"].items() + for pheno_cpd in pheno_cpds["consumed"] + ] + media = {cpd: 100 for cpd, flux in model_util.model.medium.items()} + # TODO correct or remove the media, since it seems to be overwritten by the optimization of all carbon exchanges + ### eliminate hydrogen absorption + media.update({"EX_cpd11640_e0": 0}) + past_phenoRXNs = [] + for name, phenoCPDs in phenotypes.items(): + pheno_util = MSModelUtil(model_util.model, True) + metID = phenoCPDs["consumed"][0] + try: + phenoRXN = pheno_util.model.reactions.get_by_id(f"EX_{metID}_e0") + if past_phenoRXNs: + del media[past_phenoRXNs[-1]] + except Exception as e: + print(e, f"\nEX_{metID}_e0 is not in the model {org_model.id}") + continue + media.update({phenoRXN.id: 100}) + pheno_util.add_medium(media) + print(phenoRXN.id) + pheno_util.model.solver = solver + ### define an oxygen absorption relative to the phenotype carbon source + # O2_consumption: EX_cpd00007_e0 <= phenotype carbon source # formerly <= 2 * sum(primary carbon fluxes) + coef = org_coef.copy() + coef.update({phenoRXN.reverse_variable: 1}) + pheno_util.create_constraint( + Constraint(Zero, lb=0, ub=None, name="EX_cpd00007_e0_limitation"), + coef=coef, + ) + + ## minimize the influx of all carbonaceous exchanges, mostly 
non-phenotype compounds, at a fixed biomass growth + min_growth = float(1) # arbitrarily assigned minimal growth + pheno_util.add_minimal_objective_cons(min_growth) + phenoRXN.upper_bound = 0 + for ex in pheno_util.carbon_exchange_list(): + exMet = ex.id.replace("EX_", "").replace("_e0", "") + if exMet in phenoRXNs and exMet != metID: + ex.lower_bound = 0 + # print(f"The new bounds of {exMet} exchange are: {ex.bounds}") + pheno_util.add_objective( + Zero, + "min", + coef={ + ex.reverse_variable: 1000 if ex.id != phenoRXN.id else 1 + for ex in pheno_util.carbon_exchange_list() + }, + ) + # export_lp(pheno_util.model, f"minimize_cInFlux_{phenoRXN.id}") + sol = pheno_util.model.optimize() + if sol.status != "optimal": + pheno_util.model.remove_cons_vars(["EX_cpd00007_e0_limitation"]) + coef.update({phenoRXN.reverse_variable: 5}) + pheno_util.create_constraint( + Constraint(Zero, lb=0, ub=None, name="EX_cpd00007_e0_limitation"), + coef=coef, + ) + sol = pheno_util.model.optimize() + bioFlux_check(pheno_util.model, sol) + ### limit maximum consumption to the values from the previous minimization + for ex in pheno_util.carbon_exchange_list(): + #### (limiting the reverse_variable is more restrictive than the net flux variable) + if ex.id != phenoRXN.id: + ex.reverse_variable.ub = abs(min(0, sol.fluxes[ex.id])) + + ## maximize the phenotype yield with the previously defined growth and constraints + pheno_util.add_objective(phenoRXN.reverse_variable, "min") + # export_lp(pheno_util.model, f"maximize_phenoYield_{phenoRXN.id}") + pheno_sol = pheno_util.model.optimize() + bioFlux_check(pheno_util.model, pheno_sol) + pheno_influx = pheno_sol.fluxes[phenoRXN.id] + if pheno_influx >= 0: + if not all_phenotypes: + print( + f"The phenotype carbon source has a flux of {pheno_sol.fluxes[phenoRXN.id]}." + ) + pprint( + { + rxn: flux + for rxn, flux in pheno_sol.fluxes.items() + if flux != 0 + } + ) + # TODO gapfill the model in media the non-functioning carbon source + raise NoFluxError( + f"The (+) net flux of {pheno_influx} for the {phenoRXN.id} phenotype" + f" indicates that it is an implausible phenotype." + ) + print( + f"NoFluxError: The (+) net flux of {pheno_influx} for the {phenoRXN.id}" + " phenotype indicates that it is an implausible phenotype." 
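+                    # with auto-generated phenotypes (all_phenotypes=True), the
+                    # implausible phenotype is only reported and then skipped
+                    # below, rather than raising NoFluxError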
+ ) + continue + phenoRXN.lower_bound = phenoRXN.upper_bound = pheno_influx + + ## maximize excretion of all potential carbon byproducts whose #C's < phenotype source #C's + phenotype_source_carbons = FBAHelper.rxn_mets_list(phenoRXN)[0].elements[ + "C" + ] + minimum_fluxes = {} + for carbon_source in pheno_util.carbon_exchange_list(include_unknown=False): + if ( + 0 + < FBAHelper.rxn_mets_list(carbon_source)[0].elements["C"] + < phenotype_source_carbons + ): + pheno_util.add_objective(carbon_source.flux_expression, "max") + minObj = pheno_util.model.slim_optimize() + # print(carbon_source.reaction, "\t", carbon_source.flux_expression, "\t", minObj) + if minObj > phenotype_flux_threshold: + minimum_fluxes[carbon_source.id] = minObj + # TODO limit the possible excreted compounds to only those that are defined in the media + excreted_compounds = list( + [exID for exID in minimum_fluxes.keys() if exID != "EX_cpd00011_e0"] + ) + # minimum_fluxes_df = DataFrame(data=list(minimum_fluxes.values()), index=excreted_compounds, columns=["min_flux"]) + # max_excretion_cpd = minimum_fluxes_df["minimum"].idxmin() + ### optimize the excretion of the discovered phenotype excreta + if "excreted" in phenoCPDs: + phenoCPDs["excreted"] = [ + f"EX_{cpd}_e0" for cpd in phenoCPDs["excreted"] + ] + phenoCPDs["excreted"].extend(excreted_compounds) + else: + phenoCPDs["excreted"] = excreted_compounds + pheno_excreta = [ + pheno_util.model.reactions.get_by_id(excreta) + for excreta in phenoCPDs["excreted"] + ] + pheno_util.add_objective( + sum([ex.flux_expression for ex in pheno_excreta]), "max" + ) + # export_lp(pheno_util.model, "maximize_excreta") + sol = pheno_util.model.optimize() + bioFlux_check(pheno_util.model, sol) + for ex in pheno_excreta: + ex.lower_bound = ex.upper_bound = sol.fluxes[ex.id] + + ## minimize flux of the total simulation flux through pFBA + # TODO discover why some phenotypes are infeasible with pFBA + try: + pheno_sol = pfba(pheno_util.model) + # pheno_util.add_objective(sum([rxn.flux_expression for rxn in pheno_util.e]), "min") + # pheno_sol = pheno_util.model.optimize() + except Exception as e: + print( + f"The {phenoRXN.id} phenotype of the {pheno_util.model} model is " + f"unable to be simulated with pFBA and yields a < {e} > error." + ) + sol_dict = FBAHelper.solution_to_variables_dict(pheno_sol, pheno_util.model) + simulated_growth = sum( + [ + flux + for var, flux in sol_dict.items() + if re.search(r"(^bio\d+$)", var.name) + ] + ) + if not isclose(simulated_growth, min_growth): + display( + [ + (rxn, flux) + for rxn, flux in pheno_sol.fluxes.items() + if "EX_" in rxn and flux != 0 + ] + ) + raise ObjectiveError( + f"The assigned minimal_growth of {min_growth} was not optimized" + f" during the simulation, where the observed growth was {simulated_growth}." 
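+                    # growth was constrained to min_growth via
+                    # add_minimal_objective_cons, so any deviation here signals
+                    # an inconsistent optimization rather than a biological result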
+ ) + + ## store solution fluxes and update the community_members phenotypes + met_name = strip_comp(name).replace(" ", "-") + col = content["name"] + "_" + met_name + models[pheno_util.model.id]["solutions"][col] = pheno_sol + solutions.append( + models[pheno_util.model.id]["solutions"][col].objective_value + ) + met_name = met_name.replace("_", "-").replace("~", "-") + if all_phenotypes: + if "phenotypes" not in comm_members[org_model]: + comm_members[org_model]["phenotypes"] = { + met_name: {"consumed": [strip_comp(metID)]} + } + if met_name not in comm_members[org_model]["phenotypes"]: + comm_members[org_model]["phenotypes"].update( + {met_name: {"consumed": [strip_comp(metID)]}} + ) + else: + comm_members[org_model]["phenotypes"][met_name]["consumed"] = [ + strip_comp(metID) + ] + met_pheno = content["phenotypes"][met_name] + if ( + "excreted" in met_pheno + and strip_comp(metID) in met_pheno["excreted"] + ): + comm_members[org_model]["phenotypes"][met_name].update( + {"excreted": met_pheno} + ) + past_phenoRXNs.append(phenoRXN.id) + + # construct the parsed table of all exchange fluxes for each phenotype + cols = {} + ## biomass row + cols["rxn"] = ["bio"] + for content in models.values(): + for col in content["solutions"]: + cols[col] = [0] + if col not in content["solutions"]: + continue + bio_rxns = [x for x in content["solutions"][col].fluxes.index if "bio" in x] + flux = mean( + [ + content["solutions"][col].fluxes[rxn] + for rxn in bio_rxns + if content["solutions"][col].fluxes[rxn] != 0 + ] + ) + cols[col] = [flux] + ## exchange reactions rows + looped_cols = cols.copy() + looped_cols.pop("rxn") + for content in models.values(): + for ex_rxn in content["exchanges"]: + cols["rxn"].append(ex_rxn.id) + for col in looped_cols: + ### reactions that are not present in the columns are ignored + flux = ( + 0 + if ( + col not in content["solutions"] + or ex_rxn.id not in list(content["solutions"][col].fluxes.index) + ) + else content["solutions"][col].fluxes[ex_rxn.id] + ) + cols[col].append(flux) + ## construct the DataFrame + fluxes_df = DataFrame(data=cols) + fluxes_df.index = fluxes_df["rxn"] + fluxes_df.drop("rxn", axis=1, inplace=True) + fluxes_df = fluxes_df.groupby(fluxes_df.index).sum() + fluxes_df = fluxes_df.loc[(fluxes_df != 0).any(axis=1)] + fluxes_df.astype(str) + # fluxes_df.to_csv("fluxes.csv") + return fluxes_df, comm_members diff --git a/modelseedpy/community/commkineticpkg.py b/modelseedpy/community/commkineticpkg.py index 81e1479e..c84a122c 100644 --- a/modelseedpy/community/commkineticpkg.py +++ b/modelseedpy/community/commkineticpkg.py @@ -30,7 +30,8 @@ def build_package(self, kinetic_coef, community_model=None): def build_constraint(self, species): coef = { - species.biomasses[0].forward_variable: -1 * self.parameters["kinetic_coef"] + species.biomasses[0].forward_variable: -1 * self.parameters["kinetic_coef"], + species.biomasses[0].reverse_variable: self.parameters["kinetic_coef"] } for reaction in self.model.reactions: if ( diff --git a/modelseedpy/community/commphitting.py b/modelseedpy/community/commphitting.py new file mode 100644 index 00000000..4ec1937d --- /dev/null +++ b/modelseedpy/community/commphitting.py @@ -0,0 +1,2537 @@ +# -*- coding: utf-8 -*- +# from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.core.exceptions import ( + FeasibilityError, + ParameterError, + ObjectAlreadyDefinedError, + NoFluxError, +) +from modelseedpy.core.optlanghelper import ( + OptlangHelper, + Bounds, + tupVariable, + tupConstraint, + 
tupObjective,
+    isIterable,
+    define_term,
+)
+from modelseedpy.community.datastandardization import GrowthData
+from modelseedpy.core.fbahelper import FBAHelper
+from modelseedpy.biochem import from_local
+from scipy.constants import hour, minute
+from zipfile import ZipFile, ZIP_LZMA
+from optlang import Model, Objective
+from time import sleep, process_time
+from typing import Union, Iterable
+from optlang.symbolics import Zero
+from scipy.optimize import newton
+from matplotlib import pyplot
+from math import inf, isclose
+#from deepdiff import DeepDiff
+from pandas import DataFrame
+from itertools import chain
+from pprint import pprint
+from h5py import File
+import numpy as np
+import cobra.io
+
+# from cplex import Cplex
+import warnings, logging, json, os, re
+
+logger = logging.getLogger(__name__)
+
+
+def dict_keys_exists(dic, *keys):
+    # True when the full chain of keys exists in the nested dictionary
+    if keys[0] not in dic:
+        return False
+    if len(keys) > 1:
+        return dict_keys_exists(dic[keys[0]], *keys[1:])
+    return True
+
+
+def find_dic_number(dic):
+    # recursively search a nested dictionary for the first numeric value
+    for k, v in dic.items():
+        if FBAHelper.isnumber(v):
+            return v
+        if isinstance(v, dict):
+            num = find_dic_number(v)
+            if num is not None:
+                return num
+    return None
+
+
+def trial_contents(short_code, indices_tup, values):
+    matches = [ele == short_code for ele in indices_tup]
+    return np.array(values)[matches]
+
+
+def dic_keys(dic):
+    keys = []
+    if isinstance(dic, dict):
+        for key, value in dic.items():
+            keys.append(key)
+            keys.extend(dic_keys(value))
+    return keys
+
+
+# define data objects
+def _name(name, suffix, short_code, timestep, names):
+    name = "-".join(
+        [x for x in list(map(str, [name + suffix, short_code, timestep])) if x]
+    )
+    if name not in names:
+        names.append(name)
+        return name
+    else:
+        pprint(names)
+        raise ObjectAlreadyDefinedError(
+            f"The object {name} is already defined for the problem."
+ ) + + +def _export_model_json(json_model, path): + with open(path, "w") as lp: + json.dump(json_model, lp, indent=3) + + +def _met_id_parser(met): + met_id = re.sub("(\_\w\d+)", "", met) + met_id = met_id.replace("EX_", "", 1) + met_id = met_id.replace("c_", "", 1) + return met_id + + +# define an entity as a variable or a constant +def _obj_val( + primal, name, pheno, short_code, timestep, bounds, data_timestep_hr, names +): + time_hr = int(timestep) * data_timestep_hr + return ( + tupVariable(_name(name, pheno, short_code, timestep, names), Bounds=bounds) + if not primal + else primal[short_code][name + pheno][time_hr] + ) + + +def _michaelis_menten(conc, vmax, km): + return (conc * vmax) / (km + conc) + + +def clamp(val, minimum, maximum): + return min(max(val, minimum), maximum) + + +# parse primal values for use in the optimization loops +def parse_primals(primal_values, entity_labels=None, coefs=None, kcat_vals=None): + if kcat_vals: + kcat_primal = {} + for trial, content in primal_values.items(): + for primal, time_value in content.items(): + if "bin" not in primal: + continue + name, trial = primal.split("-") + number = re.search(r"(\d)", name).group() + species, pheno = re.sub(r"(bin\d_)", "", name).split("_") + if "stationary" in pheno: + continue + if species not in kcat_primal: + kcat_primal[species] = {} + if pheno not in kcat_primal[species]: + kcat_primal[species][pheno] = 0 + # kcat_(k,new) = sum_z^Z ( kcat_z * bin_k^z ) * kcat_(k,old) < 10 + if time_value == 0 and kcat_primal[species][pheno] < 10: + kcat_primal[species][pheno] += ( + coefs[int(number) - 1] * kcat_vals[species][pheno] + ) + kcat_primal[species][pheno] = clamp( + kcat_primal[species][pheno], 1e-4, 10 + ) + return kcat_primal + select_primals = {} + for trial, entities in primal_values.items(): + select_primals[trial] = {} + for entity, times in entities.items(): + # a poor man's dictionary copy + if any([label in entity for label in entity_labels]): + select_primals[trial][entity] = dict(list(times.items())) + return select_primals + + +def signal_species(signal): + return signal.split(":")[0].replace(" ", "_") + + +def _partition_coefs(initial_val, divisor): + return ( + initial_val, + initial_val / divisor, + initial_val / divisor**2, + initial_val / divisor**3, + initial_val / divisor**4, + ) + + +biomass_partition_coefs = [ + _partition_coefs(10, 10), + _partition_coefs(2, 2), + _partition_coefs(1, 3), +] + + +class CommPhitting: + + def __init__( + self, + msdb_path, + community_members: dict = None, + fluxes_df=None, + data_df=None, + carbon_conc=None, + media_conc=None, + experimental_metadata=None, + base_media=None, + solver: str = "glpk", + all_phenotypes=True, + data_paths: dict = None, + species_abundances: str = None, + ignore_trials: Union[dict, list] = None, + ignore_timesteps: list = None, + species_identities_rows=None, + significant_deviation: float = 2, + extract_zip_path: str = None, + determine_requisite_biomass: bool = True, + consumed_mets: iter = None, + ): + self.msdb = from_local(msdb_path) + self.msdb_path = msdb_path + self.solver = solver + self.all_phenotypes = all_phenotypes + self.data_paths = data_paths + self.species_abundances = species_abundances + self.ignore_trials = ignore_trials + self.ignore_timesteps = ignore_timesteps + self.species_identities_rows = species_identities_rows + self.significant_deviation = significant_deviation + self.extract_zip_path = extract_zip_path + + self.community_members = community_members + self.consumed_mets = consumed_mets or set( + [ 
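+                # default: one entry per phenotype defined for each community
+                # member, i.e. the metabolites the phenotypes consume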
+ met + for content in community_members.values() + for met in content["phenotypes"] + ] + ) + if community_members is not None or any( + [x is None for x in [fluxes_df, data_df]] + ): + ( + self.experimental_metadata, + data_df, + fluxes_df, + carbon_conc, + self.requisite_biomass, + self.trial_name_conversion, + self.data_timestep_hr, + simulation_timestep, + media_conc, + ) = GrowthData.process( + community_members, + base_media, + solver, + all_phenotypes, + data_paths, + species_abundances, + carbon_conc, + ignore_trials, + ignore_timesteps, + species_identities_rows, + significant_deviation, + extract_zip_path, + determine_requisite_biomass, + ) + # for content in community_members.values() for met in content["phenotypes"]] + self.fluxes_tup = FBAHelper.parse_df(fluxes_df) + self.fluxes_df = fluxes_df + self.data_df = data_df + self.default_excreta = [ + index for index, row in fluxes_df.iterrows() if any(row > 1) + ] + self.parameters, self.variables, self.constraints = {}, {}, {} + self.zipped_output, self.plots, self.names = [], [], [] + self.experimental_metadata = experimental_metadata + self.carbon_conc = carbon_conc + self.media_conc = media_conc + + #################### FITTING PHASE METHODS #################### + + def fit_kcat( + self, + parameters: dict = None, + mets_to_track: list = None, + rel_final_conc: dict = None, + zero_start: list = None, + abs_final_conc: dict = None, + graphs: list = None, + data_timesteps: dict = None, + export_zip_name: str = None, + export_parameters: bool = True, + requisite_biomass: dict = None, + export_lp: str = f"solveKcat.lp", + figures_zip_name: str = None, + publishing=True, + primals_export_path=None, + ): + if export_zip_name and os.path.exists(export_zip_name): + os.remove(export_zip_name) + kcat_primal = None + requisite_biomass = requisite_biomass or self.requisite_biomass + for index, coefs in enumerate(biomass_partition_coefs): + # solve for growth rate constants with the previously solved biomasses + newSim = CommPhitting( + self.msdb_path, + None, + self.fluxes_df, + self.data_df, + self.carbon_conc, + self.media_conc, + self.experimental_metadata, + None, + self.solver, + self.all_phenotypes, + self.data_paths, + self.species_abundances, + self.ignore_trials, + self.ignore_timesteps, + self.species_identities_rows, + self.significant_deviation, + self.extract_zip_path, + True, + self.consumed_mets, + ) + newSim.define_problem( + parameters, + mets_to_track, + rel_final_conc, + zero_start, + abs_final_conc, + data_timesteps, + export_zip_name, + export_parameters, + export_lp, + kcat_primal, + coefs, + requisite_biomass, + ) + newSim.compute( + graphs, + export_zip_name, + figures_zip_name, + publishing, + primals_export_path or re.sub(r"(.lp)", ".json", export_lp), + ) + kcat_primal = parse_primals( + newSim.values, coefs=coefs, kcat_vals=newSim.parameters["kcat"] + ) + pprint(kcat_primal) + print(f"Interation {index+1} is complete\n") + kcats = {k: val for k, val in newSim.values.items() if "kcat" in k} + DataFrame(kcats).T.to_csv("pheno_growth_kcat.tsv", sep="\t") + return kcats + + def fit( + self, + parameters: dict = None, + mets_to_track: list = None, + rel_final_conc: dict = None, + zero_start: list = None, + abs_final_conc: dict = None, + graphs: list = None, + data_timesteps: dict = None, + export_zip_name: str = None, + export_parameters: bool = True, + requisite_biomass: dict = None, + export_lp: str = "CommPhitting.lp", + figures_zip_name: str = None, + publishing: bool = False, + primals_export_path=None, + ): + 
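+        # fit() performs a single define_problem/compute pass; fit_kcat() above
+        # wraps the same workflow in a loop over biomass_partition_coefs and
+        # feeds each iteration's kcat primals into the next define_problem call.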
if hasattr(self, "requisite_biomass"): + requisite_biomass = self.requisite_biomass + self.define_problem( + parameters, + mets_to_track, + rel_final_conc, + zero_start, + abs_final_conc, + data_timesteps, + export_zip_name, + export_parameters, + export_lp, + None, + None, + requisite_biomass, + ) + self.compute( + graphs, + export_zip_name, + figures_zip_name, + publishing, + primals_export_path or re.sub(r"(.lp)", ".json", export_lp), + ) + + def define_b_vars(self, pheno, short_code, timestep, variables): + self.variables["b_" + pheno][short_code][timestep] = tupVariable( + _name("b_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + self.variables["b1_" + pheno][short_code][timestep] = tupVariable( + _name("b1_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + self.variables["b2_" + pheno][short_code][timestep] = tupVariable( + _name("b2_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + self.variables["b3_" + pheno][short_code][timestep] = tupVariable( + _name("b3_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + self.variables["b4_" + pheno][short_code][timestep] = tupVariable( + _name("b4_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + self.variables["b5_" + pheno][short_code][timestep] = tupVariable( + _name("b5_", pheno, short_code, timestep, self.names), Bounds(0, 1000) + ) + variables.extend( + [ + self.variables["b_" + pheno][short_code][timestep], + self.variables["b1_" + pheno][short_code][timestep], + self.variables["b2_" + pheno][short_code][timestep], + self.variables["b3_" + pheno][short_code][timestep], + self.variables["b4_" + pheno][short_code][timestep], + self.variables["b5_" + pheno][short_code][timestep], + ] + ) + if short_code not in self.variables[f"bin1_{pheno}"]: + self.variables[f"bin1_{pheno}"][short_code] = tupVariable( + _name("bin1_", pheno, short_code, "", self.names), + Bounds(0, 1), + "binary", + ) + self.variables[f"bin2_{pheno}"][short_code] = tupVariable( + _name("bin2_", pheno, short_code, "", self.names), + Bounds(0, 1), + "binary", + ) + self.variables[f"bin3_{pheno}"][short_code] = tupVariable( + _name("bin3_", pheno, short_code, "", self.names), + Bounds(0, 1), + "binary", + ) + self.variables[f"bin4_{pheno}"][short_code] = tupVariable( + _name("bin4_", pheno, short_code, "", self.names), + Bounds(0, 1), + "binary", + ) + self.variables[f"bin5_{pheno}"][short_code] = tupVariable( + _name("bin5_", pheno, short_code, "", self.names), + Bounds(0, 1), + "binary", + ) + variables.extend( + [ + self.variables[f"bin1_{pheno}"][short_code], + self.variables[f"bin2_{pheno}"][short_code], + self.variables[f"bin3_{pheno}"][short_code], + self.variables[f"bin4_{pheno}"][short_code], + self.variables[f"bin5_{pheno}"][short_code], + ] + ) + return variables + + def define_b_cons(self, pheno, short_code, timestep, biomass_coefs): + biomass_coefs = biomass_coefs or biomass_partition_coefs[-1] + # define the partitioned biomass groups + ## b_n{pheno,t} <= coef*b_tot{pheno,t} + self.constraints["b1c_" + pheno][short_code][timestep] = tupConstraint( + _name("b1c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + { + "elements": [ + biomass_coefs[0], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b1_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b2c_" + pheno][short_code][timestep] 
= tupConstraint( + _name("b2c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + { + "elements": [ + biomass_coefs[1], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b2_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b3c_" + pheno][short_code][timestep] = tupConstraint( + _name("b3c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + { + "elements": [ + biomass_coefs[2], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b3_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b4c_" + pheno][short_code][timestep] = tupConstraint( + _name("b4c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + { + "elements": [ + biomass_coefs[3], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b4_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b5c_" + pheno][short_code][timestep] = tupConstraint( + _name("b5c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + { + "elements": [ + biomass_coefs[4], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b5_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + + # define the comprehensive biomass constraints + ## coef*b{pheno,t} - b_n{pheno,t} - 1000*bin_n{pheno} <= 0 + self.constraints["b1c_control_" + pheno][short_code][timestep] = tupConstraint( + _name("b1c_control_", pheno, short_code, timestep, self.names), + Bounds(None, 0), + { + "elements": [ + { + "elements": [ + biomass_coefs[0], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b1_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin1_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b2c_control_" + pheno][short_code][timestep] = tupConstraint( + _name("b2c_control_", pheno, short_code, timestep, self.names), + Bounds(None, 0), + { + "elements": [ + { + "elements": [ + biomass_coefs[1], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b2_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin2_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b3c_control_" + pheno][short_code][timestep] = tupConstraint( + _name("b3c_control_", pheno, short_code, timestep, self.names), + Bounds(None, 0), + { + "elements": [ + { + "elements": [ + biomass_coefs[2], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b3_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + 
self.variables[f"bin3_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b4c_control_" + pheno][short_code][timestep] = tupConstraint( + _name("b4c_control_", pheno, short_code, timestep, self.names), + Bounds(None, 0), + { + "elements": [ + { + "elements": [ + biomass_coefs[3], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b4_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin4_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["b5c_control_" + pheno][short_code][timestep] = tupConstraint( + _name("b5c_control_", pheno, short_code, timestep, self.names), + Bounds(None, 0), + { + "elements": [ + { + "elements": [ + biomass_coefs[4], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["b5_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin5_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + + # define the binary constraints + ## b_n{pheno,t} <= 1000 - 1000*bin_n{pheno} + self.constraints["bin1c_" + pheno][short_code][timestep] = tupConstraint( + _name("bin1c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + 1000, + { + "elements": [ + -1, + self.variables["b1_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin1_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["bin2c_" + pheno][short_code][timestep] = tupConstraint( + _name("bin2c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + 1000, + { + "elements": [ + -1, + self.variables["b2_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin2_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["bin3c_" + pheno][short_code][timestep] = tupConstraint( + _name("bin3c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + 1000, + { + "elements": [ + -1, + self.variables["b3_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin3_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["bin4c_" + pheno][short_code][timestep] = tupConstraint( + _name("bin4c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + 1000, + { + "elements": [ + -1, + self.variables["b4_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin4_{pheno}"][short_code].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + self.constraints["bin5c_" + pheno][short_code][timestep] = tupConstraint( + _name("bin5c_", pheno, short_code, timestep, self.names), + Bounds(0, None), + { + "elements": [ + 1000, + { + "elements": [ + -1, + self.variables["b5_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1000, + self.variables[f"bin5_{pheno}"][short_code].name, + 
], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + + # load the constraints to the model + return [ + self.constraints["b1c_" + pheno][short_code][timestep], + self.constraints["b2c_" + pheno][short_code][timestep], + self.constraints["b3c_" + pheno][short_code][timestep], + self.constraints["b4c_" + pheno][short_code][timestep], + self.constraints["b5c_" + pheno][short_code][timestep], + self.constraints["b1c_control_" + pheno][short_code][timestep], + self.constraints["b2c_control_" + pheno][short_code][timestep], + self.constraints["b3c_control_" + pheno][short_code][timestep], + self.constraints["b4c_control_" + pheno][short_code][timestep], + self.constraints["b5c_control_" + pheno][short_code][timestep], + self.constraints["bin1c_" + pheno][short_code][timestep], + self.constraints["bin2c_" + pheno][short_code][timestep], + self.constraints["bin3c_" + pheno][short_code][timestep], + self.constraints["bin4c_" + pheno][short_code][timestep], + self.constraints["bin5c_" + pheno][short_code][timestep], + ] + + def initialize_vars_cons(self, pheno, short_code): + # cvt and cvf + self.variables["cvt_" + pheno] = {} + self.variables["cvf_" + pheno] = {} + self.variables["cvt_" + pheno][short_code] = {} + self.variables["cvf_" + pheno][short_code] = {} + # total biomass and growth + self.variables["b_" + pheno] = {} + self.variables["g_" + pheno] = {} + self.variables["b_" + pheno][short_code] = {} + self.variables["g_" + pheno][short_code] = {} + self.constraints["gc_" + pheno] = {} + self.constraints["cvc_" + pheno] = {} + self.constraints["gc_" + pheno][short_code] = {} + self.constraints["cvc_" + pheno][short_code] = {} + # partitioned biomasses + self.variables["b1_" + pheno] = {} + self.variables["b2_" + pheno] = {} + self.variables["b3_" + pheno] = {} + self.variables["b4_" + pheno] = {} + self.variables["b5_" + pheno] = {} + self.variables["b1_" + pheno][short_code] = {} + self.variables["b2_" + pheno][short_code] = {} + self.variables["b3_" + pheno][short_code] = {} + self.variables["b4_" + pheno][short_code] = {} + self.variables["b5_" + pheno][short_code] = {} + ## biomass binary variables + self.variables[f"bin1_{pheno}"] = {} + self.variables[f"bin2_{pheno}"] = {} + self.variables[f"bin3_{pheno}"] = {} + self.variables[f"bin4_{pheno}"] = {} + self.variables[f"bin5_{pheno}"] = {} + self.variables[f"bin1_{pheno}"][short_code] = {} + self.variables[f"bin2_{pheno}"][short_code] = {} + self.variables[f"bin3_{pheno}"][short_code] = {} + self.variables[f"bin4_{pheno}"][short_code] = {} + self.variables[f"bin5_{pheno}"][short_code] = {} + ## biomass partition constraints + self.constraints["b1c_" + pheno] = {} + self.constraints["b2c_" + pheno] = {} + self.constraints["b3c_" + pheno] = {} + self.constraints["b4c_" + pheno] = {} + self.constraints["b5c_" + pheno] = {} + self.constraints["b1c_" + pheno][short_code] = {} + self.constraints["b2c_" + pheno][short_code] = {} + self.constraints["b3c_" + pheno][short_code] = {} + self.constraints["b4c_" + pheno][short_code] = {} + self.constraints["b5c_" + pheno][short_code] = {} + self.constraints["b1c_control_" + pheno] = {} + self.constraints["b2c_control_" + pheno] = {} + self.constraints["b3c_control_" + pheno] = {} + self.constraints["b4c_control_" + pheno] = {} + self.constraints["b5c_control_" + pheno] = {} + self.constraints["b1c_control_" + pheno][short_code] = {} + self.constraints["b2c_control_" + pheno][short_code] = {} + self.constraints["b3c_control_" + pheno][short_code] = {} + self.constraints["b4c_control_" + 
pheno][short_code] = {} + self.constraints["b5c_control_" + pheno][short_code] = {} + self.constraints[f"binc_{pheno}"] = {} + self.constraints[f"binc_{pheno}"][short_code] = {} + self.constraints["bin1c_" + pheno] = {} + self.constraints["bin2c_" + pheno] = {} + self.constraints["bin3c_" + pheno] = {} + self.constraints["bin4c_" + pheno] = {} + self.constraints["bin5c_" + pheno] = {} + self.constraints["bin1c_" + pheno][short_code] = {} + self.constraints["bin2c_" + pheno][short_code] = {} + self.constraints["bin3c_" + pheno][short_code] = {} + self.constraints["bin4c_" + pheno][short_code] = {} + self.constraints["bin5c_" + pheno][short_code] = {} + + def get_timestep_bin(self, timestep): + if timestep < self.first: + return 0 + elif timestep < self.second: + return 1 + elif timestep < self.third: + return 2 + elif timestep < self.fourth: + return 3 + return 4 + + def define_problem( + self, + parameters=None, + mets_to_track=None, + rel_final_conc=None, + zero_start=None, + abs_final_conc=None, + data_timesteps=None, + export_zip_name: str = None, + export_parameters: bool = True, + export_lp: str = "CommPhitting.lp", + primal_values=None, + biomass_coefs=None, + requisite_biomass: dict = None, + biolog_simulation=False, + export_phenotype_profiles=True, + ): + # parse the growth data + growth_tup = FBAHelper.parse_df(self.data_df, False) + self.phenotypes = list(self.fluxes_tup.columns) + self.phenotypes.extend( + [ + signal_species(signal) + "_stationary" + for signal in growth_tup.columns + if (":" in signal and "OD" not in signal) + ] + ) + self.species_list = [ + signal_species(signal) for signal in growth_tup.columns if ":" in signal + ] + num_sorted = np.sort(np.array([int(obj[1:]) for obj in set(growth_tup.index)])) + # TODO - short_codes must be distinguished for different conditions + unique_short_codes = [ + f"{growth_tup.index[0][0]}{num}" for num in map(str, num_sorted) + ] + full_times = growth_tup.values[:, growth_tup.columns.index("Time (s)")] + self.times = { + short_code: trial_contents(short_code, growth_tup.index, full_times) + for short_code in unique_short_codes + } + average_time_series = np.mean(list(self.times.values()), axis=0) + points = len(average_time_series) + self.first, self.second, self.third, self.fourth = ( + int(points * 0.1), + int(points * 0.25), + int(points * 0.45), + int(points * 0.7), + ) + self.time_ranges = { + 0: average_time_series[: self.first], + 1: average_time_series[self.first : self.second], + 2: average_time_series[self.second : self.third], + 3: average_time_series[self.third : self.fourth], + 4: average_time_series[self.fourth :], + } + + # define default values + # TODO render bcv and cvmin dependent upon temperature, and possibly trained on Carlson's data + parameters, data_timesteps = parameters or {}, data_timesteps or {} + self.parameters["data_timestep_hr"] = ( + np.mean(np.diff(np.array(list(self.times.values())).flatten())) / hour + if not hasattr(self, "data_timestep_hr") + else self.data_timestep_hr + ) + self.parameters.update( + { + "timestep_hr": self.parameters["data_timestep_hr"], + "cvct": 0.01, + "cvcf": 0.01, + "bcv": 0.01, + "cvmin": 0.01, + "kcat": 0.33, + "diffpos": 1, + "diffneg": 1, # coefficients that weight difference between experimental and predicted biomass + "stationary": 10, # the penalty coefficient for the stationary phenotype + } + ) + self.parameters.update(parameters) + # distribute kcat values to all phenotypes of all species and update from previous simulations where necessary + 
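+        # _universalize expands the scalar kcat default into a nested
+        # {species: {phenotype: value}} map; kcat primals carried over from a
+        # prior fit_kcat iteration then overwrite those defaults in the loop below.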
self.parameters.update( + self._universalize(self.parameters, "kcat", exclude=["stationary"]) + ) + if primal_values is not None: + for species, content in self.parameters["kcat"].items(): + if species not in primal_values: + continue + for pheno, content2 in content.items(): + if pheno not in primal_values[species]: + continue + for time, val in content2.items(): + if time not in primal_values[species][pheno]: + continue + self.parameters["kcat"][species][pheno][time] = val + print(self.parameters["kcat"]) + # define the metabolites that are tracked, exchanged, and not available in the media + # TODO the default zero_start logic appears to be incorrect + self.zero_start = zero_start or [ + met + for met in self.consumed_mets + if (met not in self.carbon_conc or self.carbon_conc[met] == 0) + ] + self.rel_final_conc = rel_final_conc or { + met: 0.1 + for met, concs in self.carbon_conc.items() + if any( + [concs[short_code] > 0 for short_code in self.data_df.index.unique()] + ) + and met not in self.zero_start + } + self.abs_final_conc = abs_final_conc or {} + if mets_to_track: + self.mets_to_track = mets_to_track + elif not isinstance(rel_final_conc, dict): + self.mets_to_track = self.fluxes_tup.index + else: + self.mets_to_track = list(self.rel_final_conc.keys()) + self.zero_start + print(self.mets_to_track) + + ts_to_delete = ( + {} + ) # {short_code: full_times for short_code in unique_short_codes} + if data_timesteps: # {short_code:[times]} + for short_code, times in data_timesteps.items(): + ts_to_delete[short_code] = set(list(range(len(full_times)))) - set( + times + ) + self.times[short_code] = np.delete( + self.times[short_code], list(ts_to_delete[short_code]) + ) + + # construct the problem + objective = tupObjective( + "minimize variance and phenotypic transitions", [], "min" + ) + constraints, variables, simulated_mets = [], [], [] + time_1 = process_time() + for exID in self.fluxes_tup.index: + if exID == "bio": + continue + met_id = re.search(r"(cpd\d{5})", exID).group() + met = self.msdb.compounds.get_by_id(met_id) + if "C" not in met.elements: + continue + concID = f"c_{met_id}_e0" + simulated_mets.append(met_id) + self.variables[concID] = {} + self.constraints["dcc_" + met_id] = {} + + # define the growth rate for each metabolite and concentrations + # TODO the MM parameters may be deletable once the binned kcat method is refined + if "Vmax" and "Km" in self.parameters: + self.parameters["Vmax"].update( + self._universalize(self.parameters["Vmax"], met_id) + ) + self.parameters["Km"].update( + self._universalize(self.parameters["Km"], met_id) + ) + for short_code in unique_short_codes: + self.variables[concID][short_code] = {} + self.constraints["dcc_" + met_id][short_code] = {} + timesteps = list(range(1, len(self.times[short_code]) + 1)) + for timestep in timesteps: + ## define the concentration variables + conc_var = tupVariable( + _name(concID, "", short_code, timestep, self.names) + ) + ## constrain initial time concentrations to the media or a large default + if timestep == timesteps[0]: + initial_val = None + if met_id in self.media_conc: + initial_val = self.media_conc[met_id] + if met_id in self.zero_start: + initial_val = 0 + if dict_keys_exists(self.carbon_conc, met_id, short_code): + initial_val = self.carbon_conc[met_id][short_code] + if initial_val is not None: + conc_var = conc_var._replace( + bounds=Bounds(initial_val, initial_val) + ) + if biolog_simulation: + conc_var = conc_var._replace(bounds=Bounds(1, None)) + ## mandate complete carbon consumption + 
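+                    ## a tracked carbon source is capped at rel_final_conc * its
+                    ## initial concentration, or pinned by abs_final_conc, which
+                    ## deliberately takes precedence when both are defined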
elif timestep == timesteps[-1] and ( + met_id in self.rel_final_conc or met_id in self.abs_final_conc + ): + if met_id in self.rel_final_conc: + final_bound = ( + self.variables[concID][short_code][1].bounds.lb + * self.rel_final_conc[met_id] + ) + if ( + met_id in self.abs_final_conc + ): # this intentionally overwrites rel_final_conc + final_bound = self.abs_final_conc[met_id] + conc_var = conc_var._replace(bounds=Bounds(0, final_bound)) + if met_id in self.zero_start: + conc_var = conc_var._replace( + bounds=Bounds(final_bound, final_bound) + ) + self.variables[concID][short_code][timestep] = conc_var + variables.append(self.variables[concID][short_code][timestep]) + for pheno in self.phenotypes: + self.constraints["dbc_" + pheno] = { + short_code: {} for short_code in unique_short_codes + } + + # define growth and biomass variables and constraints + for pheno in self.phenotypes: + for short_code in unique_short_codes: + self.initialize_vars_cons(pheno, short_code) + timesteps = list(range(1, len(self.times[short_code]) + 1)) + nth_percentile_timestep = timesteps[int(0.90 * len(timesteps))] + penalty_range = np.linspace( + self.parameters["stationary"], + self.parameters["stationary"] / 10, + len(timesteps[nth_percentile_timestep:]), + ) + timestep_excess_count = 0 + for timestep in map(int, timesteps): + variables = self.define_b_vars( + pheno, short_code, timestep, variables + ) + if short_code not in self.constraints[f"binc_{pheno}"]: + self.constraints[f"binc_{pheno}"][short_code] = tupConstraint( + _name("binc_", pheno, short_code, "", self.names), + Bounds(0, 4), + { + "elements": [ + self.variables[f"bin1_{pheno}"][short_code].name, + self.variables[f"bin2_{pheno}"][short_code].name, + self.variables[f"bin3_{pheno}"][short_code].name, + self.variables[f"bin4_{pheno}"][short_code].name, + self.variables[f"bin5_{pheno}"][short_code].name, + ], + "operation": "Add", + }, + ) + constraints.append( + self.constraints[f"binc_{pheno}"][short_code] + ) + constraints.extend( + self.define_b_cons(pheno, short_code, timestep, biomass_coefs) + ) + + ## define the growth rate variable or primal value + species, phenotype = pheno.split("_") + self.variables["g_" + pheno][short_code][timestep] = tupVariable( + _name("g_", pheno, short_code, timestep, self.names) + ) + variables.append(self.variables["g_" + pheno][short_code][timestep]) + + if "stationary" in pheno: + weight = self.parameters["stationary"] + if timestep > nth_percentile_timestep: + weight = penalty_range[timestep_excess_count] + timestep_excess_count += 1 + objective.expr.extend( + [ + { + "elements": [ + { + "elements": [ + weight, + self.variables["b_" + pheno][ + short_code + ][timestep].name, + ], + "operation": "Mul", + } + ], + "operation": "Add", + } + ] + ) + continue + # the conversion rates to and from the stationary phase + self.variables["cvt_" + pheno][short_code][timestep] = tupVariable( + _name("cvt_", pheno, short_code, timestep, self.names), + Bounds(0, 100), + ) + self.variables["cvf_" + pheno][short_code][timestep] = tupVariable( + _name("cvf_", pheno, short_code, timestep, self.names), + Bounds(0, 100), + ) + variables.extend( + [ + self.variables["cvf_" + pheno][short_code][timestep], + self.variables["cvt_" + pheno][short_code][timestep], + ] + ) + + # cvt <= bcv*b_{pheno} + cvmin + self.constraints["cvc_" + pheno][short_code][timestep] = ( + tupConstraint( + _name("cvc_", pheno, short_code, timestep, self.names), + (0, None), + { + "elements": [ + { + "elements": [ + -1, + self.variables["cvt_" + 
pheno][short_code][ + timestep + ].name, + ], + "operation": "Mul", + } + ], + "operation": "Add", + }, + ) + ) + # biomass_term = [self.parameters['bcv']*b_value + self.parameters['cvmin']] if FBAHelper.isnumber(b_value) else [ + biomass_term = [ + self.parameters["cvmin"], + { + "elements": [ + self.parameters["bcv"], + self.variables["b_" + pheno][short_code][timestep].name, + ], + "operation": "Mul", + }, + ] + self.constraints["cvc_" + pheno][short_code][timestep].expr[ + "elements" + ].extend(biomass_term) + + # g_{pheno} = b_{pheno}*v_{pheno} + b_values = [ + self.variables["b1_" + pheno][short_code][timestep].name, + self.variables["b2_" + pheno][short_code][timestep].name, + self.variables["b3_" + pheno][short_code][timestep].name, + self.variables["b4_" + pheno][short_code][timestep].name, + self.variables["b5_" + pheno][short_code][timestep].name, + ] + self.constraints["gc_" + pheno][short_code][timestep] = ( + tupConstraint( + name=_name("gc_", pheno, short_code, timestep, self.names), + expr={ + "elements": [ + *[ + { + "elements": [ + -self.parameters["kcat"][species][ + phenotype + ], + b, + ], + "operation": "Mul", + } + for b in b_values + ], + self.variables["g_" + pheno][short_code][ + timestep + ].name, + ], + "operation": "Add", + }, + ) + ) + + constraints.extend( + [ + self.constraints["cvc_" + pheno][short_code][timestep], + self.constraints["gc_" + pheno][short_code][timestep], + ] + ) + # self.constraints["binTot_" + pheno][short_code]]) + + # define the concentration constraint + half_dt = self.parameters["data_timestep_hr"] / 2 + time_2 = process_time() + print( + f"Done with concentrations and biomass loops: {(time_2 - time_1) / 60} min" + ) + for r_index, met in enumerate(self.fluxes_tup.index): + met_id = _met_id_parser(met) + if met_id not in simulated_mets: + continue + concID = f"c_{met_id}_e0" + for short_code in unique_short_codes: + timesteps = list(range(1, len(self.times[short_code]) + 1)) + for timestep in timesteps[:-1]: + # c_{met} + dt/2*sum_k^K(n_{k,met} * (g_{pheno}+g+1_{pheno})) = c+1_{met} + next_timestep = timestep + 1 + growth_phenos = [ + [ + self.variables["g_" + pheno][short_code][ + next_timestep + ].name, + self.variables["g_" + pheno][short_code][timestep].name, + ] + for pheno in self.fluxes_tup.columns + ] + self.constraints["dcc_" + met_id][short_code][timestep] = ( + tupConstraint( + name=_name( + "dcc_", met_id, short_code, timestep, self.names + ), + expr={ + "elements": [ + self.variables[concID][short_code][timestep].name, + { + "elements": [ + -1, + self.variables[concID][short_code][ + next_timestep + ].name, + ], + "operation": "Mul", + }, + *OptlangHelper.dot_product( + growth_phenos, + heuns_coefs=half_dt + * self.fluxes_tup.values[r_index], + ), + ], + "operation": "Add", + }, + ) + ) + constraints.append( + self.constraints["dcc_" + met_id][short_code][timestep] + ) + + # define the conversion variables of every signal for every phenotype + # for signal in growth_tup.columns[2:]: + # for pheno in self.fluxes_tup.columns: + # conversion_name = "_".join([signal, pheno, "__conversion"]) + # self.variables[conversion_name] = tupVariable(conversion_name) + # variables.append(self.variables[conversion_name]) + + time_3 = process_time() + print(f"Done with DCC loop: {(time_3 - time_2) / 60} min") + species_phenos = {} + self.conversion_bounds = [5e-6, 50] + for index, org_signal in enumerate(growth_tup.columns[2:]): + # signal = org_signal.split(":")[1] + signal = org_signal.replace(":", "|") + species = 
signal_species(org_signal) + species_phenos[species] = { + None if "OD" in species else f"{species}_stationary" + } + signal_column_index = index + 2 + data_timestep = 1 + self.variables[signal + "|conversion"] = tupVariable( + signal + "|conversion", bounds=Bounds(*self.conversion_bounds) + ) + variables.append(self.variables[signal + "|conversion"]) + + self.variables[signal + "|bio"] = {} + self.variables[signal + "|diffpos"] = {} + self.variables[signal + "|diffneg"] = {} + self.variables["g_" + species] = {} + self.constraints[signal + "|bioc"] = {} + self.constraints[signal + "|diffc"] = {} + self.constraints["gc_" + species] = {} + self.constraints["totVc_" + species] = {} + self.constraints["totGc_" + species] = {} + self.constraints[signal + "|bio_finalc"] = {} + for short_code in unique_short_codes: + self.variables[signal + "|bio"][short_code] = {} + self.variables[signal + "|diffpos"][short_code] = {} + self.variables[signal + "|diffneg"][short_code] = {} + self.variables["g_" + species][short_code] = {} + self.constraints[signal + "|bioc"][short_code] = {} + self.constraints[signal + "|diffc"][short_code] = {} + self.constraints["gc_" + species][short_code] = {} + self.constraints["totVc_" + species][short_code] = {} + self.constraints["totGc_" + species][short_code] = {} + # self.constraints[signal + '|bio_finalc'][short_code] = {} + # the value entries are matched to only the timesteps that are condoned by data_timesteps + values_slice = trial_contents( + short_code, growth_tup.index, growth_tup.values + ) + if ts_to_delete: + values_slice = np.delete( + values_slice, list(ts_to_delete[short_code]), axis=0 + ) + timesteps = list(range(1, len(values_slice) + 1)) + # the last timestep is omitted since Heun's method in the modelled biomass + ## requires a future timestep, which does not exist for the last timestep + for timestep in timesteps[:-1]: + ## the user timestep and data timestep must be synchronized + if ( + int(timestep) * self.parameters["timestep_hr"] + < data_timestep * self.parameters["data_timestep_hr"] + ): + print( + f"Skipping timestep {timestep} that does not align with the user's timestep" + ) + continue + data_timestep += 1 + if data_timestep > int( + self.times[short_code][-1] / self.parameters["data_timestep_hr"] + ): + print( + f"The user-defined time exceeds the simulation time, so the DBC & diff loop is broken." 
+ ) + break + next_timestep = int(timestep) + 1 + ## the phenotype transition terms are aggregated + total_biomass, signal_sum, from_sum, to_sum = [], [], [], [] + for pheno_index, pheno in enumerate(self.phenotypes): + ### define the collections of signal and pheno terms + if species in pheno or "OD" in signal: + # if not FBAHelper.isnumber(b_values[pheno][short_code][timestep]): + signal_sum.append( + { + "operation": "Mul", + "elements": [ + -1, + self.variables["b_" + pheno][short_code][ + timestep + ].name, + ], + } + ) + # else: + # signal_sum.append(-b_values[pheno][short_code][timestep]) + ### total_biomass.append(self.variables["b_"+pheno][short_code][timestep].name) + if all( + [ + "OD" not in signal, + species in pheno, + "stationary" not in pheno, + ] + ): + species_phenos[species].add(pheno) + from_sum.append( + { + "operation": "Mul", + "elements": [ + -1, + self.variables["cvf_" + pheno][short_code][ + timestep + ].name, + ], + } + ) + to_sum.append( + self.variables["cvt_" + pheno][short_code][ + timestep + ].name + ) + for pheno in species_phenos[species]: + if "OD" in signal: + continue + # print(pheno, timestep, b_values[pheno][short_code][timestep], b_values[pheno][short_code][next_timestep]) + if "stationary" in pheno: + # b_{phenotype} - sum_k^K(es_k*cvf) + sum_k^K(pheno_bool*cvt) = b+1_{phenotype} + self.constraints["dbc_" + pheno][short_code][timestep] = ( + tupConstraint( + name=_name( + "dbc_", pheno, short_code, timestep, self.names + ), + expr={ + "elements": [*from_sum, *to_sum], + "operation": "Add", + }, + ) + ) + else: + # b_{phenotype} + dt/2*(g_{phenotype} + g+1_{phenotype}) + cvf-cvt = b+1_{phenotype} + self.constraints["dbc_" + pheno][short_code][timestep] = ( + tupConstraint( + name=_name( + "dbc_", pheno, short_code, timestep, self.names + ), + expr={ + "elements": [ + self.variables["cvf_" + pheno][short_code][ + timestep + ].name, + { + "elements": [ + half_dt, + self.variables["g_" + pheno][ + short_code + ][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + half_dt, + self.variables["g_" + pheno][ + short_code + ][next_timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + -1, + self.variables["cvt_" + pheno][ + short_code + ][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + ) + # if not FBAHelper.isnumber(self.variables['b_' + pheno][short_code][timestep]): + biomass_term = [ + self.variables["b_" + pheno][short_code][timestep].name, + { + "elements": [ + -1, + self.variables["b_" + pheno][short_code][ + next_timestep + ].name, + ], + "operation": "Mul", + }, + ] + # else: + # biomass_term = [b_values[pheno][short_code][timestep]-b_values[pheno][short_code][next_timestep]] + self.constraints["dbc_" + pheno][short_code][timestep].expr[ + "elements" + ].extend(biomass_term) + constraints.append( + self.constraints["dbc_" + pheno][short_code][timestep] + ) + + if not requisite_biomass or any( + [ + timestep != timesteps[-2], + signal not in requisite_biomass[short_code], + ] + ): + self.variables[signal + "|bio"][short_code][timestep] = ( + tupVariable( + _name(signal, "|bio", short_code, timestep, self.names) + ) + ) + else: + biomass_flux = requisite_biomass[short_code][signal]["bio"] + estimated_biomass = biomass_flux # * int(timestep)*self.parameters['data_timestep_hr'] + self.variables[signal + "|bio"][short_code][timestep] = ( + tupVariable( + _name(signal, "|bio", short_code, timestep, self.names), + Bounds(estimated_biomass, None), + ) + ) + self.variables[signal + 
"|diffpos"][short_code][timestep] = ( + tupVariable( + _name(signal, "|diffpos", short_code, timestep, self.names), + Bounds(0, 100), + ) + ) + self.variables[signal + "|diffneg"][short_code][timestep] = ( + tupVariable( + _name(signal, "|diffneg", short_code, timestep, self.names), + Bounds(0, 100), + ) + ) + variables.extend( + [ + self.variables[signal + "|bio"][short_code][timestep], + self.variables[signal + "|diffpos"][short_code][timestep], + self.variables[signal + "|diffneg"][short_code][timestep], + ] + ) + + # {signal}__conversion*datum = {signal}__bio + # TODO - the conversion variable must be a constant for BIOLOG conditions + self.constraints[signal + "|bioc"][short_code][timestep] = ( + tupConstraint( + name=_name( + signal, "|bioc", short_code, timestep, self.names + ), + expr={ + "elements": [ + { + "elements": [ + -1, + self.variables[signal + "|bio"][short_code][ + timestep + ].name, + ], + "operation": "Mul", + }, + { + "elements": [ + self.variables[signal + "|conversion"].name, + values_slice[timestep, signal_column_index], + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + ) + constraints.append( + self.constraints[signal + "|bioc"][short_code][timestep] + ) + + # {speces}_bio + {signal}_diffneg-{signal}_diffpos = sum_k^K(es_k*b_{phenotype}) + self.constraints[signal + "|diffc"][short_code][timestep] = ( + tupConstraint( + name=_name( + signal, "|diffc", short_code, timestep, self.names + ), + expr={ + "elements": [ + self.variables[signal + "|bio"][short_code][ + timestep + ].name, + self.variables[signal + "|diffneg"][short_code][ + timestep + ].name, + { + "elements": [ + -1, + self.variables[signal + "|diffpos"][ + short_code + ][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + }, + ) + ) + if all([isinstance(val, dict) for val in signal_sum]): + self.constraints[signal + "|diffc"][short_code][timestep].expr[ + "elements" + ].extend(signal_sum) + else: + raise ValueError( + f"The {signal_sum} value has unexpected contents." 
+ ) + constraints.append( + self.constraints[signal + "|diffc"][short_code][timestep] + ) + + objective.expr.extend( + [ + { + "elements": [ + { + "elements": [ + self.parameters["diffpos"], + self.variables[f"{signal}|diffpos"][ + short_code + ][timestep].name, + ], + "operation": "Mul", + }, + { + "elements": [ + self.parameters["diffneg"], + self.variables[f"{signal}|diffneg"][ + short_code + ][timestep].name, + ], + "operation": "Mul", + }, + ], + "operation": "Add", + } + ] + ) + + time_4 = process_time() + print(f"Done with the DBC & diffc loop: {(time_4 - time_3) / 60} min") + + # construct the problem + self.problem = OptlangHelper.define_model( + "CommPhitting model", variables, constraints, objective, True + ) + self.hdf5_name = export_lp.replace(".lp", ".h5") + self.hdf5_file = File(self.hdf5_name, "w") + time_5 = process_time() + print( + f"Done with constructing the {type(self.problem)} model: {(time_5 - time_4) / 60} min" + ) + + # export contents + if export_phenotype_profiles: + phenotype_profiles_name = "phenotype_profiles.tsv" + self.fluxes_df.to_csv(phenotype_profiles_name, sep="\t") + self.zipped_output.append(phenotype_profiles_name) + if export_parameters: + parameter_name = "parameters.tsv" + DataFrame( + data=list(self.parameters.values()), + index=list(self.parameters.keys()), + columns=["values"], + ).to_csv(parameter_name, sep="\t") + self.zipped_output.append(parameter_name) + if export_lp: + if re.search(r"(\\\\/)", export_lp): + os.makedirs(os.path.dirname(export_lp), exist_ok=True) + with open(export_lp, "w") as lp: + lp.write(self.problem.to_lp()) + model_name = "CommPhitting.json" + _export_model_json(self.problem.to_json(), model_name) + self.zipped_output.extend([export_lp, model_name]) + if export_zip_name: + self.zip_name = export_zip_name + sleep(2) + with ZipFile(self.zip_name, "a", compression=ZIP_LZMA) as zp: + for file in self.zipped_output: + zp.write(file) + os.remove(file) + self.zipped_output.remove(file) + time_6 = process_time() + print(f"Done exporting the content: {(time_6 - time_5) / 60} min") + + def compute( + self, + graphs: list = None, + export_zip_name=None, + figures_zip_name=None, + publishing=False, + primals_export_path: str = "primal_values.json", + remove_empty_plots=False, + ): + print("starting optimization") + time1 = process_time() + self.values = {} + solution = self.problem.optimize() + timesteps = min(list(map(len, self.times.values()))) + fit_quality = self.problem.objective.value / timesteps + print(f"The optimization fit quality is {fit_quality}") + if "parameters.tsv" in self.zipped_output: + self.parameters["fit"] = fit_quality + parameter_name = "parameters.tsv" + DataFrame( + data=list(self.parameters.values()), + index=list(self.parameters.keys()), + columns=["values"], + ).to_csv(parameter_name, sep="\t") + with ZipFile(self.zip_name, "a", compression=ZIP_LZMA) as zp: + for file in self.zipped_output: + zp.write(file) + os.remove(file) + + # TODO approximate a threshold of good fits, and trigger black box optimization for bad fits + ## that iteratively adjust parameters until the fit metric surmounts the threshold. + + # categorize the primal values by trial and time + if "optimal" not in solution: + raise FeasibilityError( + f"The solution is sub-optimal, with a(n) {solution} status." 
+ ) + if all(np.array(list(self.problem.primal_values.values())) == 0): + raise NoFluxError("The simulation lacks any flux.") + for variable, value in self.problem.primal_values.items(): + if "v_" in variable: + self.values[variable] = value + elif "conversion" in variable or re.search(r"(bin\d)", variable): + self.values[short_code].update({variable: value}) + if value in self.conversion_bounds: + warnings.warn( + f"The conversion factor {value} optimized to a bound, which may be " + f"indicative of an error, such as improper kinetic rates." + ) + else: + basename, short_code, timestep = variable.split("-") + time_hr = int(timestep) * self.parameters["data_timestep_hr"] + self.values[short_code] = self.values.get(short_code, {}) + self.values[short_code][basename] = self.values[short_code].get( + basename, {} + ) + self.values[short_code][basename][time_hr] = value + + # export the processed primal values for graphing + # with open(primals_export_path, 'w') as out: + # json.dump(self.values, out, indent=3) + # if not export_zip_name and hasattr(self, 'zip_name'): + # export_zip_name = self.zip_name + # if export_zip_name: + # with ZipFile(export_zip_name, 'a', compression=ZIP_LZMA) as zp: + # zp.write(primals_export_path) + # os.remove(primals_export_path) + # visualize the specified information + time2 = process_time() + if graphs: + self.graph( + graphs, + export_zip_name=figures_zip_name or export_zip_name, + publishing=publishing, + remove_empty_plots=remove_empty_plots, + ) + + # parse the primal values + values_df = DataFrame(self.values) + values_index = values_df.index.tolist() + for col in values_df.columns: + trial_values = values_df[col].tolist() + ## process the times + times = [list(ele.keys()) for ele in trial_values if isinstance(ele, dict)] + max_time = max(list(map(len, times))) + for max_time_series in times: + if len(max_time_series) == max_time: + break + trial_path = f"results/primals/{col}/" + self.hdf5_file.create_dataset(f"{trial_path}/times", data=max_time_series) + ## process the data values + for index, ele in enumerate(trial_values): + dataset_name = f"{trial_path}/{values_index[index]}" + if FBAHelper.isnumber(ele): + self.hdf5_file.create_dataset(dataset_name, data=[float(ele)]) + elif isinstance(ele, dict): + self.hdf5_file.create_dataset( + dataset_name, data=list(map(float, ele.values())) + ) + self.hdf5_file[dataset_name].attrs["full_time"] = ( + len(ele.values()) == max_time + ) + + self.hdf5_file.close() + with ZipFile(self.zip_name, "a", compression=ZIP_LZMA) as zp: + zp.write(self.hdf5_name) + os.remove(self.hdf5_name) + + time3 = process_time() + print(f"Optimization completed in {(time2-time1)/60} minutes") + print(f"Graphing completed in {(time3-time2)/60} minutes") + + def load_model( + self, + mscomfit_json_path: str = None, + zip_name: str = None, + model_to_load: dict = None, + ): + if zip_name: + with ZipFile(zip_name, "r") as zp: + zp.extract(mscomfit_json_path) + if mscomfit_json_path: + with open(mscomfit_json_path, "r") as mscmft: + return json.load(mscmft) + if model_to_load: + self.problem = Model.from_json(model_to_load) + + @staticmethod + def assign_values(param, var, next_dimension, kcat=True): + dic = {var: {}} + for dim1, dim2_list in next_dimension.items(): + if isinstance(dim2_list, dict): + dic[var].update(CommPhitting.assign_values(param, dim1, dim2_list)) + else: + if kcat: + dic[var][dim1] = param + else: + dic[var][dim1] = {dim2: param for dim2 in dim2_list} + return dic + + def _universalize(self, param, var, 
next_dimension=None, exclude=None, tsBin=False): + if not next_dimension: + next_dimension = {} + for organism in self.fluxes_tup.columns: + species, pheno = organism.split("_") + if pheno in exclude: + continue + if not tsBin: + if species in next_dimension: + next_dimension[species].append(pheno) + else: + next_dimension[species] = [pheno] + else: + if species in next_dimension: + next_dimension[species].update({pheno: self.time_ranges}) + else: + next_dimension[species] = {pheno: self.time_ranges} + if FBAHelper.isnumber(param): + return CommPhitting.assign_values(param, var, next_dimension) + elif FBAHelper.isnumber(param[var]): + return CommPhitting.assign_values(param[var], var, next_dimension) + elif isinstance(param[var], dict): + return { + var: { + dim1: {dim2: param[var][dim1] for dim2 in dim2_list} + for dim1, dim2_list in next_dimension.items() + } + } + else: + logger.critical( + f"The param (with keys {dic_keys(param)}) and var {var} are not amenable" + " with the parameterizing a universal value." + ) + # {short_code: {list(timestep_info.keys())[0]: find_dic_number(param)} for short_code, timestep_info in variable.items()}} + + def adjust_color(self, color, amount=0.5): + """ + Lightens the given color by multiplying (1-luminosity) by the given amount. + Input can be matplotlib color string, hex string, or RGB tuple. + + Examples: + >> lighten_color('g', 0.3) + >> lighten_color('#F034A3', 0.6) + >> lighten_color((.3,.55,.1), 0.5) + """ + import colorsys + import matplotlib.colors as mc + + try: + c = mc.cnames[color] + except: + c = color + c = colorsys.rgb_to_hls(*mc.to_rgb(c)) + return colorsys.hls_to_rgb(c[0], max(0, min(1, amount * c[1])), c[2]) + + def _add_plot( + self, + ax, + labels, + label, + basename, + trial, + x_axis_split, + linestyle="solid", + scatter=False, + color=None, + xs=None, + ys=None, + ): + labels.append(label or basename.split("-")[-1]) + xs = ( + xs + if xs is not None + else list(map(float, self.values[trial][basename].keys())) + ) + ys = ( + ys + if ys is not None + else list(map(float, self.values[trial][basename].values())) + ) + if scatter: + ax.scatter(xs, ys, s=10, label=labels[-1], color=color or None) + else: + ax.plot(xs, ys, label=labels[-1], linestyle=linestyle, color=color or None) + ax.set_xticks(list(map(int, xs))[::x_axis_split]) + return ax, labels + + def graph( + self, + graphs, + primal_values_filename: str = None, + primal_values_zip_path: str = None, + export_zip_name: str = None, + data_timestep_hr: float = 0.163, + publishing: bool = False, + title: str = None, + remove_empty_plots: bool = False, + ): + print(export_zip_name) + # define the default timestep ratio as 1 + data_timestep_hr = self.parameters.get("data_timestep_hr", data_timestep_hr) + timestep_ratio = data_timestep_hr / self.parameters.get( + "timestep_hr", data_timestep_hr + ) + if primal_values_filename: + if primal_values_zip_path: + with ZipFile(primal_values_zip_path, "r") as zp: + zp.extract(primal_values_filename) + with open(primal_values_filename, "r", encoding="utf-8") as primal: + self.values = json.load(primal) + + # plot the content for desired trials + x_axis_split = int(3 / data_timestep_hr / timestep_ratio) + self.plots = set() + contents = {"biomass": "b_", "all_biomass": "b_", "growth": "g_", "conc": "c_"} + mM_threshold = 1e-3 + for graph_index, graph in enumerate(graphs): + content = contents.get(graph["content"], graph["content"]) + y_label = "Variable value" + x_label = r"Time ($hr$)" + if any([x in graph["content"] for x in ["biomass", 
"OD"]]): + total_biomasses = {name: [] for name in self.species_list} + total_biomasses.update({"OD": []}) + if "species" not in graph: + graph["species"] = self.species_list + if "biomass" in graph["content"]: + y_label = r"Biomass ($\frac{g}{L}$)" + elif "growth" in graph["content"]: + y_label = r"Biomass growth ($\frac{g}{hr}$)" + graph["experimental_data"] = graph.get("experimental_data", False) + if "painting" not in graph: + graph["painting"] = { + "OD": { + "color": "blue", + "linestyle": "solid", + "name": "Total biomass", + }, + "ecoli": {"color": "red", "linestyle": "dashed", "name": "E. coli"}, + "pf": { + "color": "green", + "linestyle": "dotted", + "name": "P. fluorescens", + }, + } + graph["parsed"] = graph.get("parsed", False) + if "phenotype" in graph and graph["phenotype"] == "*": + if "species" not in graph: + graph["species"] = self.species_list + graph["phenotype"] = set( + [ + pheno.split("_")[-1] + for pheno in self.phenotypes + if pheno.split("_")[0] in graph["species"] + ] + ) + # TODO - a species-resolved option must be developed for the paper figure + if "species" in graph and graph["species"] == "*": + graph["species"] = self.species_list + elif content == "c_" and "mets" not in graph: + print(self.mets_to_track) + graph["mets"] = self.mets_to_track + elif not any(["species" in graph, "mets" in graph]): + raise ValueError( + f"The specified graph {graph} must define species for which data will be plotted." + ) + print(f"graph_{graph_index}") + pprint(graph) + + # define figure specifications + if publishing: + pyplot.rc("axes", titlesize=22, labelsize=28) + pyplot.rc("xtick", labelsize=24) + pyplot.rc("ytick", labelsize=24) + pyplot.rc("legend", fontsize=18) + if graph["parsed"]: + parsed_graphs = {} + for species in graph["species"]: + parsed_graphs[species] = pyplot.subplots(dpi=200, figsize=(11, 7)) + else: + fig, ax = pyplot.subplots(dpi=200, figsize=(11, 7)) + yscale = "linear" + + # populate the figures + for trial, basenames in self.values.items(): + if trial not in graph["trial"]: + continue + labels = [] + for basename, values in basenames.items(): + # graph experimental and total simulated biomasses + if any([x in graph["content"] for x in ["biomass", "OD"]]): + if "b_" in basename: + vals = list(map(float, values.values())) + var_name, species, phenotype = basename.split("_") + # ic(basename) + label = f"{species}_biomass (model)" + if publishing: + species_name = graph["painting"][species]["name"] + label = f"{species_name} total (model)" + labels.append({species: label}) + if remove_empty_plots and all([v == 0 for v in vals]): + print(f"The {basename} is empty and thus is removed.") + continue + if ( + any( + [ + x in graph["content"] + for x in ["total", "biomass", "OD"] + ] + ) + or graph["species"] == self.species_list + ): # and not graph["parsed"]: + total_biomasses["OD"].append(vals) + if "OD" not in graph["content"]: + total_biomasses[species].append(vals) + if all( + [ + graph["experimental_data"], + "|bio" in basename, + ] + ): + # any([content in basename])]): # TODO - any() must include all_biomass and total + species, signal, phenotype = basename.split("|") + label = basename + if publishing: + species_name = ( + "total" + if "OD" in signal + else graph["painting"][species]["name"] + ) + label = f"Experimental {species_name} (from {signal})" + # print(basename, label, self.values[trial][basename].values()) + if remove_empty_plots and all( + self.values[trial][basename].values() == 0 + ): + print(f"The {basename} is empty and thus is 
removed.") + continue + ax, labels = self._add_plot( + ax, + labels, + label, + basename, + trial, + x_axis_split, + scatter=True, + color=self.adjust_color( + graph["painting"][species]["color"], 1.5 + ), + ) + + if content not in basename: + continue + # graph individual phenotypes + if "phenotype" in graph: + # print(graph['phenotype']) + for specie in graph["species"]: + if specie not in basename: + continue + if not any([p in basename for p in graph["phenotype"]]): + print(f"{basename} data with unknown phenotype.") + continue + if remove_empty_plots and all( + self.values[trial][basename].values() == 0 + ): + print(f"The {specie} is empty and thus is removed.") + continue + if graph["parsed"]: + fig, ax = parsed_graphs[specie] + ## define graph characteristics + label = basename.split("_")[-1] + style = "solid" + if len(graph["species"]) > 1: + label = re.sub(r"(^[a-b]+\_)", "", basename) + style = graph["painting"][specie]["linestyle"] + ax, labels = self._add_plot( + ax, labels, label, basename, trial, x_axis_split, style + ) + if graph["parsed"]: + parsed_graphs[specie] = (fig, ax) + # graph media concentration plots + elif "mets" in graph and all( + [ + any([x in basename for x in graph["mets"]]), + "c_cpd" in basename, + ] + ): + if not any( + np.array(list(self.values[trial][basename].values())) + > mM_threshold + ): + continue + if remove_empty_plots and all( + self.values[trial][basename].values() == 0 + ): + continue + label = self.msdb.compounds.get_by_id( + re.search(r"(cpd\d+)", basename).group() + ).name + ax, labels = self._add_plot( + ax, labels, label, basename, trial, x_axis_split + ) + yscale = "log" + y_label = r"Concentration ($mM$)" + + if labels: # assesses whether graph(s) were created + ## graph all of the total biomasses + if any([x in graph["content"] for x in ["OD", "biomass", "total"]]): + labeled_species = [ + label for label in labels if isinstance(label, dict) + ] + for name, vals in total_biomasses.items(): + # ic(name) + if not vals or ( + len(total_biomasses) == 2 and "OD" not in name + ): + continue + if len(total_biomasses) == 2: + specie_label = [ + graph["painting"][name]["name"] + for name in total_biomasses + if "OD" not in name + ][0] + label = f"{graph['painting'][name]['name']} ({specie_label})" + else: + label = f"{name}_biomass (model)" + if labeled_species: + for label_specie in labeled_species: + if name in label_specie: + label = label_specie[name] + break + style = ( + "solid" + if ( + len(graph["species"]) < 1 + or name not in graph["painting"] + ) + else graph["painting"][name]["linestyle"] + ) + style = "dashdot" if "model" in label else style + style = ( + "solid" + if ( + "OD" in name + and not graph["experimental_data"] + or "total" in graph["content"] + ) + else style + ) + total_biomass = sum(np.array(vals))[:-1] + xs = list(map(float, values.keys())) + if graph["parsed"]: + fig, ax = parsed_graphs[name] + self._add_plot( + ax, + labels, + label, + None, + None, + x_axis_split, + style, + False, + graph["painting"][name]["color"], + xs, + total_biomass, + ) + if graph["parsed"]: + ## process and export the parsed figures + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.grid(axis="y") + ax.set_yscale(yscale) + ax.legend() + phenotype_id = graph.get("phenotype", "") + if "phenotype" in graph and not isinstance( + graph["phenotype"], str + ): + phenotype_id = ( + f"{','.join(graph['phenotype'])} phenotypes" + ) + fig_name = f'{"_".join([trial, name, phenotype_id, content])}.jpg' + fig.savefig( + fig_name, 
bbox_inches="tight", transparent=True + ) + self.plots.add(fig_name) + + if graph["parsed"]: + continue + ## process and export the non-parsed figures + phenotype_id = graph.get("phenotype", "") + if "phenotype" in graph and not isinstance(graph["phenotype"], str): + phenotype_id = f"{','.join(graph['phenotype'])} phenotypes" + + species_id = "" + if "mets" not in graph and content != "c_": + species_id = ( + graph["species"] + if isinstance(graph["species"], str) + else ",".join(graph["species"]) + ) + if "species" in graph and graph["species"] == self.species_list: + species_id = "all species" + else: + phenotype_id = f"{','.join(graph['species'])} species" + if species_id == "all species" and not phenotype_id: + phenotype_id = ",".join(graph["species"]) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + if "mets" in graph: + ax.set_ylim(mM_threshold) + ax.grid(axis="y") + if len(labels) > 1: + ax.legend() + else: + yscale = "linear" + ax.set_yscale(yscale) + if not publishing: + if not title: + org_content = ( + content + if content not in contents.values() + else list(contents.keys())[ + list(contents.values()).index(content) + ] + ) + this_title = f"{org_content} of {species_id} ({phenotype_id}) in the {trial} trial" + if content == "c_": + this_title = f"{org_content} in the {trial} trial" + ax.set_title(this_title) + else: + ax.set_title(title) + fig_name = ( + f'{"_".join([trial, species_id, phenotype_id, content])}.jpg' + ) + if "mets" in graph: + fig_name = f"{trial}_{','.join(graph['mets'])}_c.jpg" + fig.savefig(fig_name, bbox_inches="tight", transparent=True) + + self.plots.add(fig_name) + + # export the figures with other simulation content + if export_zip_name: + with ZipFile(export_zip_name, "a", compression=ZIP_LZMA) as zp: + for plot in self.plots: + zp.write(plot) + os.remove(plot) + + #################### ENGINEERING PHASE METHODS #################### + + def engineering(self): + if not hasattr(self, "problem"): + self.fit() # TODO - accommodate both fitting a new model and loading an existing model + + # This will capture biomass variables at all times and trials, which seems undesirable + self.problem.objective = Objective( + sum([x for x in self.problem.variables if "bio" in x.name]) + ) + + # Use a community COBRA model and CommKinetics with the fitted kinetic parameters? 
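+        # A possible refinement (a sketch with hypothetical names, not the implemented
+        # method): limit the objective to the final-timestep biomass variables so that
+        # earlier times and trials do not dominate, e.g.:
+        #   final_bio = [x for x in self.problem.variables
+        #                if "bio" in x.name and x.name.endswith(str(final_timestep))]
+        #   self.problem.objective = Objective(sum(final_bio))
+        # where < final_timestep > would be derived from self.times for each trial.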
+
+    def _add_phenotypes(self):
+        pass
+
+    def _change_obj(self):
+        pass
+
+
+class BIOLOGPhitting(CommPhitting):
+    def __init__(
+        self,
+        carbon_conc,
+        media_conc,
+        biolog_df,
+        fluxes_df,
+        experimental_metadata,
+        msdb_path,
+        community_members,
+    ):
+        self.biolog_df = biolog_df
+        self.experimental_metadata = experimental_metadata
+        self.carbon_conc = carbon_conc
+        self.media_conc = media_conc or []
+        self.fluxes_df = fluxes_df
+        self.phenotypes = list(self.fluxes_df.columns)
+        self.phenotypes.extend(
+            [
+                signal_species(signal) + "_stationary"
+                for signal in self.biolog_df
+                if ":" in signal
+            ]
+        )
+        self.community_members = community_members
+        # import os
+        from modelseedpy.biochem import from_local
+
+        self.msdb_path = msdb_path
+        self.msdb = from_local(msdb_path)
+
+    def fitAll(
+        self,
+        parameters: dict = None,
+        rel_final_conc: float = None,
+        abs_final_conc: dict = None,
+        graphs: list = None,
+        data_timesteps: dict = None,
+        export_zip_name: str = None,
+        export_parameters: bool = True,
+        requisite_biomass: dict = None,
+        figures_zip_name: str = None,
+        publishing: bool = False,
+    ):
+        # simulate each condition
+        if export_zip_name and os.path.exists(export_zip_name):
+            os.remove(export_zip_name)
+        org_rel_final_conc = rel_final_conc
+        # total_reactions = set(list(chain.from_iterable([model.reactions for model in models_dict.values()])))
+        model_abbreviations = ",".join(
+            [content["name"] for content in self.community_members.values()]
+        )
+        for exp_index, experiment in self.experimental_metadata.iterrows():
+            print(f"\n{exp_index} {experiment}")
+            display(experiment)
+            pheno = experiment["ModelSEED_ID"]
+            if not pheno:
+                print("The BIOLOG condition is not defined.")
+                continue
+            for model in self.community_members:
+                cpd = self.msdb.compounds.get_by_id(pheno)
+                if "C" not in cpd.elements or not any(
+                    [re.search(pheno, rxn.id) for rxn in model.reactions]
+                ):
+                    if "valid_condition" not in locals():
+                        valid_condition = False
+                    continue
+                exp_list = [pheno] if isinstance(pheno, str) else pheno
+                self.community_members[model].update(
+                    {
+                        "phenotypes": {
+                            re.sub(r"(-|\s)", "", experiment["condition"]): {
+                                "consumed": exp_list
+                            }
+                        }
+                    }
+                )
+                # determine the requisite biomass for each condition based on which member consumes the compound
+                valid_condition = True
+            # proceed if none of the members can utilize the phenotype condition
+            if not valid_condition:
+                print(
+                    f"The BIOLOG condition with {experiment['ModelSEED_ID']} is not"
+                    f" absorbed by the {model_abbreviations} model(s)."
+                )
+                continue
+            print(
+                f"The {experiment['ModelSEED_ID']} ({cpd.formula}) metabolite of the "
+                f"{experiment['condition']} condition may feed the {model_abbreviations} model(s)."
+            )
+            if not any(
+                [experiment["ModelSEED_ID"] in pheno for pheno in self.phenotypes]
+            ):
+                print(
+                    f"The {experiment['ModelSEED_ID']} ({cpd.formula}) metabolite of the "
+                    f"{experiment['condition']} condition is not a suitable phenotype for "
+                    f"the {model_abbreviations} model(s)."
+                )
+                continue
+
+            # for exp_index, experiment in self.experimental_metadata.iterrows():
+            # the model(s) for which the condition is a suitable carbon source must be defined here
+            # simulate through the kinetics ranges with conditions that can be used by one of members
+            rel_final_conc = {experiment["ModelSEED_ID"]: org_rel_final_conc}
+            export_path = os.path.join(
+                os.getcwd(), "BIOLOG_LPs", f"{exp_index}_{','.join(exp_list)}.lp"
+            )
+            kcat_primal = None
+            for coef_index, coefs in enumerate(biomass_partition_coefs):
+                # solve for growth rate constants with the previously solved biomasses
+                new_simulation = CommPhitting(
+                    self.fluxes_df,
+                    self.carbon_conc,
+                    self.media_conc,
+                    self.msdb_path,
+                    self.biolog_df.loc[exp_index, :],
+                    self.experimental_metadata,
+                )
+                new_simulation.define_problem(
+                    parameters,
+                    exp_list,
+                    rel_final_conc,
+                    set(
+                        list(
+                            chain.from_iterable(
+                                [
+                                    content["excretions"]
+                                    for content in self.community_members.values()
+                                ]
+                            )
+                        )
+                    ),
+                    abs_final_conc,
+                    data_timesteps,
+                    export_zip_name,
+                    export_parameters,
+                    export_path,
+                    kcat_primal,
+                    coefs,
+                    requisite_biomass,
+                    True,
+                )
+                time1 = process_time()
+                # fitAll() does not accept a primals path, so a per-condition default is used
+                primals_export_path = f"BIOLOG_{experiment['ModelSEED_ID']}.json"
+                try:
+                    new_simulation.compute(
+                        graphs,
+                        export_zip_name,
+                        None,
+                        publishing,
+                        primals_export_path,
+                        True,
+                    )
+                except NoFluxError as e:
+                    print(e)
+                kcat_primal = parse_primals(
+                    new_simulation.values,
+                    coefs=coefs,
+                    kcat_vals=new_simulation.parameters["kcat"],
+                )
+                time2 = process_time()
+                print(
+                    f"Done simulating with the coefficients for biomass partitions: {coef_index}"
+                    f"\n{(time2 - time1) / 60} minutes"
+                )
+                pprint(kcat_primal)
+        print("\n\n\n")
+        return {k: val for k, val in new_simulation.values.items() if "kcat" in k}
diff --git a/modelseedpy/community/commscores_old.py b/modelseedpy/community/commscores_old.py
new file mode 100644
index 00000000..ed0f2eed
--- /dev/null
+++ b/modelseedpy/community/commscores_old.py
@@ -0,0 +1,1856 @@
+from modelseedpy.core.exceptions import ObjectiveError, ParameterError
+from modelseedpy.community.commhelper import build_from_species_models
+from modelseedpy.community.mscompatibility import MSCompatibility
+from modelseedpy.core.msminimalmedia import MSMinimalMedia
+from modelseedpy.community.mscommunity import MSCommunity
+from modelseedpy.core.msmodelutl import MSModelUtil
+from modelseedpy.core.fbahelper import FBAHelper
+from modelseedpy.core.msgapfill import MSGapfill
+from itertools import combinations, permutations, chain
+from optlang import Variable, Constraint, Objective
+from numpy import array, unique, ndarray, where, sort, array_split, nan
+from collections import Counter
+from deepdiff import DeepDiff  # (old, new)
+from typing import Iterable, Union
+from pprint import pprint
+from numpy.random import shuffle
+from multiprocess import current_process
+from math import inf
+import sigfig
+
+# from icecream import ic
+import re
+
+# from math import prod
+
+# silence deprecation warnings from DeepDiff parsing the syntrophy
+import warnings
+
+warnings.simplefilter("ignore", category=DeprecationWarning)
+
+rm_comp = FBAHelper.remove_compartment
+
+
+def _compatibilize(member_models: Iterable, printing=False):
+    # return member_models
+    models = MSCompatibility.standardize(
+        member_models, conflicts_file_name="exchanges_conflicts.json", printing=printing
+    )
+    if not isinstance(member_models, (set, list, tuple)):
+        return models[0]
+    return models
+
+
+def _load_models(
member_models: Iterable, com_model=None, compatibilize=True, printing=False +): + # ic(member_models, com_model, compatibilize) + if not com_model and member_models: + model = build_from_species_models(member_models, name="SMETANA_pair") + return member_models, model # (model, names=names, abundances=abundances) + # models = PARSING_FUNCTION(community_model) # TODO the individual models of a community model can be parsed + if compatibilize: + return ( + _compatibilize(member_models, printing), + _compatibilize([com_model], printing)[0], + ) + return member_models, com_model + + +def _get_media( + media=None, + com_model=None, + model_s_=None, + min_growth=None, + environment=None, + interacting=True, + printing=False, + minimization_method="minFlux", + skip_bad_media=False, +): + # ic(media, com_model, model_s_) + if com_model is None and model_s_ is None: + raise TypeError("< com_model > or < model_s_ > must be parameterized.") + if media is not None: + if model_s_ is not None and not isinstance(model_s_, (list, set, tuple)): + return media["members"][model_s_.id]["media"] + elif com_model is not None: + return media["community_media"] + return media + # model_s_ is either a singular model or a list of models + if com_model is not None: + try: + com_media, media_sol = MSMinimalMedia.determine_min_media( + com_model, + minimization_method, + min_growth, + None, + interacting, + 5, + printing, + ) + except Exception as e: + if skip_bad_media: + com_media, media_sol = None, None + else: + print(e) + if model_s_ is not None: + if not isinstance(model_s_, (list, set, tuple, ndarray)): + try: + return MSMinimalMedia.determine_min_media( + model_s_, + minimization_method, + min_growth, + environment, + interacting, + printing, + ) + except Exception as e: + if not skip_bad_media: + print(e) + return None + members_media = {} + for model in model_s_: + try: + members_media[model.id] = { + "media": MSMinimalMedia.determine_min_media( + model, + minimization_method, + min_growth, + environment, + interacting, + printing, + )[0] + } + continue + except Exception as e: + if skip_bad_media: + continue + else: + print(e) + # print(members_media) + if com_model is None: + return members_media + else: + return com_media, media_sol + return {"community_media": com_media, "members": members_media} + + +def _sigfig_check(value, sigfigs, default): + if str(value) in ["inf", "nan"]: + value = "" + if FBAHelper.isnumber(value): + return sigfig.round(value, sigfigs) + else: + return default + + +def nanFilter(value, string=True): + if isinstance(value, str) or value is None: + if string: + return value + else: + return nan + if any([value < 0, value > 1e5]): + return "" if string else nan + return value + + +class CommScores: + def __init__( + self, + member_models, + min_growth=0.1, + n_solutions=100, + environment=None, + abstol=1e-3, + media_dict=None, + printing=True, + raw_content=False, + antismash_json_path: str = None, + antismash_zip_path: str = None, + minimal_media_method="minFlux", + ): + self.min_growth = min_growth + self.abstol = abstol + self.n_solutions = n_solutions + self.printing = printing + self.raw_content = raw_content + self.antismash_json_path = antismash_json_path + self.antismash_zip_path = antismash_zip_path + + # process the models + self.models = _compatibilize(member_models) + self.community = MSModelUtil(build_from_species_models(self.models)) + ## define the environment + if environment: + if hasattr(environment, "get_media_constraints"): + ### standardize modelseed media into 
COBRApy media + environment = { + "EX_" + exID: -bound[0] + for exID, bound in environment.get_media_constraints().items() + } + self.community.add_medium(environment) + self.environment = environment + ## test growth + for model in self.models: + if model.slim_optimize() == 0: + raise ObjectiveError( + f"The model {model.id} possesses an objective value of 0 in complete media, " + "which is incompatible with minimal media computations and hence SMETANA." + ) + if self.community.model.slim_optimize() == 0: + raise ObjectiveError( + f"The community model {self.community.model.id} possesses an objective " + "value of 0 in complete media, which is incompatible with minimal " + "media computations and hence SMETANA." + ) + ## determine the minimal media for each model, including the community + self.media = ( + media_dict + if media_dict + else MSMinimalMedia.comm_media_est( + member_models, + self.community.model, + minimal_media_method, + min_growth, + self.environment, + True, + n_solutions, + printing, + ) + ) + + def all_scores( + self, + mp_score=True, + kbase_obj=None, + cobrakbase_path: str = None, + kbase_token_path: str = None, + annotated_genomes: dict = None, + ): + mro = self.mro_score() + mip = self.mip_score(interacting_media=self.media) + mp = None if not mp_score else self.mp_score() + mu = None # self.mu_score() + sc = None # self.sc_score() + smetana = None # self.smetana_score() + gyd = self.gyd_score() + fs = ( + self.fs_score() + if any( + [ + kbase_obj is not None, + annotated_genomes != [], + cobrakbase_path is not None and kbase_token_path is not None, + ] + ) + else None + ) + return { + "mro": mro, + "mip": mip, + "mp": mp, + "mu": mu, + "sc": sc, + "smetana": smetana, + "gyd": gyd, + "fs": fs, + } + + def mro_score(self): + self.mro_val = CommScores.mro( + self.models, + self.media["members"], + self.min_growth, + self.media, + self.raw_content, + self.environment, + self.printing, + True, + ) + if not self.printing: + return self.mro_val + if self.raw_content: + for pair, (interaction, media) in self.mro_val.items(): + newcomer, established = pair.split("---") + print( + f"\n(MRO) The {newcomer} media {media} possesses {interaction} shared " + f"requirements with the {established} established member." + ) + return self.mro_val + for pair, mro in self.mro_val.items(): + newcomer, established = pair.split("---") + print( + f"\nThe {newcomer} on {established} MRO score: {mro[0]} ({mro[0]*100:.2f}%). " + f"This is the percent of nutritional requirements in {newcomer} " + f"that overlap with {established} ({mro[1]}/{mro[2]})." 
+            )
+        return self.mro_val
+
+    def mip_score(
+        self, interacting_media: dict = None, noninteracting_media: dict = None
+    ):
+        interacting_media = interacting_media or self.media or None
+        diff, self.mip_val = CommScores.mip(
+            self.models,
+            self.community.model,
+            self.min_growth,
+            interacting_media,
+            noninteracting_media,
+            self.environment,
+            self.printing,
+            True,
+        )
+        if not self.printing:
+            return self.mip_val
+        print(
+            f"\nMIP score: {self.mip_val}\t\t\t{self.mip_val} required compound(s) can be sourced via syntrophy:"
+        )
+        if self.raw_content:
+            pprint(diff)
+        return self.mip_val
+
+    def gyd_score(self, coculture_growth=False):
+        self.gyd_val = CommScores.gyd(
+            self.models, environment=self.environment, coculture_growth=coculture_growth
+        )
+        if not self.printing:
+            return self.gyd_val
+        growth_type = "monocultural" if not coculture_growth else "cocultural"
+        for pair, score in self.gyd_val.items():
+            print(
+                f"\nGYD score: The {growth_type} growth difference between the {pair} member models"
+                f" is {score} times greater than the growth of the slower member."
+            )
+        return self.gyd_val
+
+    def fs_score(
+        self,
+        kbase_obj=None,
+        cobrakbase_path: str = None,
+        kbase_token_path: str = None,
+        annotated_genomes: dict = None,
+    ):
+        self.fs_val = CommScores.fs(
+            self.models, kbase_obj, cobrakbase_path, kbase_token_path, annotated_genomes
+        )
+        if not self.printing:
+            return self.fs_val
+        for pair, score in self.fs_val.items():
+            print(
+                f"\nFS Score: The similarity of RAST functional SSO ontology "
+                f"terms between the {pair} members is {score}."
+            )
+        return self.fs_val
+
+    def mp_score(self):
+        print("executing MP")
+        self.mp_val = CommScores.mp(
+            self.models,
+            self.environment,
+            self.community.model,
+            None,
+            self.abstol,
+            self.printing,
+        )
+        if not self.printing:
+            return self.mp_val
+        if self.raw_content:
+            print(
+                "\n(MP) The possible contributions of each member in the member media include:\n"
+            )
+            pprint(self.mp_val)
+        else:
+            print(
+                "\nMP score:\t\t\tEach member can possibly contribute the following to the community:\n"
+            )
+            for member, contributions in self.mp_val.items():
+                print(member, "\t", len(contributions))
+        return self.mp_val
+
+    def mu_score(self):
+        member_excreta = self.mp_score() if not hasattr(self, "mp_val") else self.mp_val
+        self.mu_val = CommScores.mu(
+            self.models,
+            self.environment,
+            member_excreta,
+            self.n_solutions,
+            self.abstol,
+            True,
+            self.printing,
+        )
+        if not self.printing:
+            return self.mu_val
+        print(
+            "\nMU score:\t\t\tThe fraction of solutions in which each member is the "
+            "syntrophic receiver that contain a respective metabolite:\n"
+        )
+        pprint(self.mu_val)
+        return self.mu_val
+
+    def sc_score(self):
+        self.sc_val = CommScores.sc(
+            self.models,
+            self.community.model,
+            self.min_growth,
+            self.n_solutions,
+            self.abstol,
+            True,
+            self.printing,
+        )
+        if not self.printing:
+            return self.sc_val
+        print(
+            "\nSC score:\t\t\tThe fraction of community members who syntrophically contribute to each species:\n"
+        )
+        pprint(self.sc_val)
+        return self.sc_val
+
+    def smetana_score(self):
+        if not hasattr(self, "sc_val"):
+            self.sc_val = self.sc_score()
+        sc_coupling = all(val is not None for val in self.sc_val.values())
+        if not hasattr(self, "mu_val"):
+            self.mu_val = self.mu_score()
+        if not hasattr(self, "mp_val"):
+            self.mp_val = self.mp_score()
+
+        self.smetana = CommScores.smetana(
+            self.models,
+            self.community.model,
+            self.min_growth,
+            self.n_solutions,
+            self.abstol,
+            (self.sc_val, self.mu_val, self.mp_val),
+            True,
+            sc_coupling,
+            self.printing,
+        )
+        if self.printing:
+            print("\nsmetana score:\n")
+            pprint(self.smetana)
+        return self.smetana
+
+    def antiSMASH_scores(self, antismash_json_path=None):
+        self.antismash = CommScores.antiSMASH(
+            antismash_json_path or self.antismash_json_path
+        )
+        if not self.printing:
+            return self.antismash
+        if self.raw_content:
+            print(
+                "\n(antismash) The biosynthetic_areas, BGCs, protein_annotations, clusterBlast, and "
+                "num_clusterBlast from the provided antiSMASH results:\n"
+            )
+            print(
+                "The 'areas' that antiSMASH determines produce biosynthetic products:"
+            )
+            pprint(self.antismash[0])
+            print("The set of biosynthetic gene clusters:")
+            pprint(self.antismash[1])
+            print("The set of clusterblast protein annotations:")
+            pprint(self.antismash[2])
+            print("Resistance information from clusterblast")
+            pprint(self.antismash[3])
+            print("The number of proteins associated with resistance")
+            pprint(self.antismash[4])
+            return self.antismash
+        print("\nantiSMASH scores:\n")
+        print(
+            "The community exhibited:"
+            f"\n- {len(self.antismash[0])} 'areas' that antiSMASH determines produce biosynthetic products."
+            f"\n- {len(self.antismash[1])} biosynthetic gene clusters."
+            f"\n- {len(self.antismash[2])} clusterblast protein annotations."
+            f"\n- {len(self.antismash[3])} parcels of resistance information from clusterblast."
+            f"\n- {self.antismash[4]} proteins associated with resistance."
+        )
+        return list(map(len, self.antismash[:4])) + [self.antismash[4]]
+
+    ###### STATIC METHODS OF THE SMETANA SCORES, WHICH ARE APPLIED IN THE ABOVE CLASS OBJECT ######
+
+    @staticmethod
+    def _check_model(model_util, media, model_str, skip_bad_media):
+        default_media = model_util.model.medium
+        if media is not None:
+            model_util.add_medium(media)
+        obj_val = model_util.model.slim_optimize()
+        if obj_val == 0 or not FBAHelper.isnumber(obj_val):
+            print(
+                f"The {model_str} model input does not yield an operational model, and may require gapfilling."
+ ) + # if not skip_bad_media: return MSGapfill.gapfill(model_util.model, media) + model_util.add_medium(default_media) + return model_util.model + + @staticmethod + def _load(model, kbase_obj): + model_str = model + if len(model) == 2: + model = kbase_obj.get_from_ws(*model) + else: + model = kbase_obj.get_from_ws(model) + return model, model_str + + @staticmethod + def _determine_growths(modelUtils): + return [util.model.slim_optimize() for util in modelUtils] + + @staticmethod + def calculate_scores( + pairs, + models_media=None, + environments=None, + annotated_genomes=True, + lazy_load=False, + kbase_obj=None, + cip_score=True, + costless=True, + skip_bad_media=False, + anme_comm=False, + print_progress=False, + ): + from pandas import Series + + if isinstance(pairs, list): + ( + pairs, + models_media, + environments, + annotated_genomes, + lazy_load, + kbase_obj, + ) = pairs + series, mets = [], [] + if not isinstance(environments, (list, tuple)): + environments = [environments] + if isinstance(environments, (list, tuple)) and hasattr(environments[0], "name"): + environments = { + m.name: FBAHelper.convert_kbase_media(m, 1000) for m in environments + } + elif not isinstance(environments, dict): + environments = {f"media{i}": m for i, m in enumerate(environments)} + pid = current_process().name + model_utils = {} + count = 0 + for model1, models in pairs.items(): + if model1.id == "": + model1.id = "model1" + if lazy_load: + model1, model1_str = CommScores._load(model1, kbase_obj) + else: + model1_str = model1.id + if model1.id not in models_media: + models_media[model1.id] = { + "media": _get_media(model_s_=model1, skip_bad_media=skip_bad_media) + } + if models_media[model1.id] is None: + continue + if model1.id not in model_utils: + model_utils[model1.id] = MSModelUtil(model1) + # print(pid, model1) + for model_index, model2 in enumerate(models): + if model2.id == "": + model2.id = "model2" + if lazy_load: + model2, model2_str = CommScores._load(model2, kbase_obj) + else: + model2_str = model2.id + if model2.id not in models_media: + models_media[model2.id] = { + "media": _get_media( + model_s_=model2, skip_bad_media=skip_bad_media + ) + } + if models_media[model2.id] is None: + continue + if model2.id not in model_utils: + model_utils[model2.id] = MSModelUtil(model2) + grouping = [model1, model2] + grouping_utils = [model_utils[model1.id], model_utils[model2.id]] + modelIDs = [model.id for model in grouping] + comm_model = build_from_species_models(grouping) + community = MSCommunity(comm_model, ids=modelIDs) + comm_sol = comm_model.optimize() + print(f"{pid}~~{count}\t{modelIDs}") + for environName, environ in environments.items(): + if print_progress: + print(f"\tEnvironment\t{environName}", end="\t") + if not anme_comm: + model1 = CommScores._check_model( + model_utils[model1.id], environ, model1_str, skip_bad_media + ) + model2 = CommScores._check_model( + model_utils[model2.id], environ, model2_str, skip_bad_media + ) + # initiate the KBase output + report_dic = { + f"model{i+1}": modelID for i, modelID in enumerate(modelIDs) + } + g1, g2, comm = CommScores._determine_growths( + [model_utils[model1.id], model_utils[model2.id], community.util] + ) + g1, g2, comm = ( + _sigfig_check(g1, 5, ""), + _sigfig_check(g2, 5, ""), + _sigfig_check(comm, 5, ""), + ) + report_dic.update( + { + "media": environName, + "model1 growth": g1, + "model2 growth": g2, + "community growth": comm, + } + ) + coculture_growths = { + mem.id: comm_sol.fluxes[mem.primary_biomass.id] + for mem in 
community.members + } + report_dic.update( + { + f"coculture growth model{modelIDs.index(memID)}": growth + for memID, growth in coculture_growths.items() + } + ) + # define the MRO content + mro_values = CommScores.mro( + grouping, models_media, raw_content=True, environment=environ + ) + report_dic.update( + { + f"MRO_model{modelIDs.index(models_string.split('--')[0])+1}": f"{100*len(intersection)/len(memMedia):.3f}% ({len(intersection)}/{len(memMedia)})" + for models_string, ( + intersection, + memMedia, + ) in mro_values.items() + } + ) + mets.append({"MRO metabolites": list(mro_values.values())[0][0]}) + if print_progress: + print("MRO done", end="\t") + # define the CIP content + if cip_score: + cip_values = CommScores.cip( + modelutils=[model_utils[mem.id] for mem in grouping] + ) + report_dic.update({"CIP": cip_values[1]}) + mets[-1].update({"CIP metabolites": list(cip_values[0])}) + if print_progress: + print("CIP done", end="\t") + # define the MIP content + mip_values = CommScores.mip( + grouping, + comm_model, + 0.1, + None, + None, + environ, + print_progress, + True, + costless, + costless, + skip_bad_media, + ) + # print(mip_values) + if mip_values is not None: + report_dic.update( + { + f"MIP_model{modelIDs.index(models_name)+1}": str( + len(received) + ) + for models_name, received in mip_values[0].items() + } + ) + mets[-1].update( + { + "MIP model1 metabolites": list(mip_values[0].values())[ + 0 + ], + "MIP model2 metabolites": list(mip_values[0].values())[ + 1 + ], + } + ) + if costless: + for models_name, received in mip_values[1].items(): + report_dic[ + f"MIP_model{modelIDs.index(models_name)+1} (costless)" + ] = ( + report_dic[ + f"MIP_model{modelIDs.index(models_name)+1}" + ] + + f" ({len(received)})" + ) + del report_dic[ + f"MIP_model{modelIDs.index(models_name)+1}" + ] + if print_progress: + print("costless_MIP done", end="\t") + else: + report_dic.update( + {f"MIP_model1 (costless)": "", f"MIP_model2 (costless)": ""} + ) + mets[-1].update( + { + "MIP model1 metabolites": [None], + "MIP model2 metabolites": [None], + } + ) + if print_progress: + print("MIP done", end="\t") + # define the BSS content + bss_values = CommScores.bss( + grouping, + grouping_utils, + environments, + models_media, + skip_bad_media, + ) + report_dic.update( + { + f"BSS_model{modelIDs.index(name.split(' supporting ')[0])+1}": f"{_sigfig_check(100*val, 5, '')}%" + for name, (mets, val) in bss_values.items() + } + ) + mets[-1].update( + { + "BSS model1 metabolites": [ + met_set for met_set, val in bss_values.values() + ][0], + "BSS model2 metabolites": [ + met_set for met_set, val in bss_values.values() + ][1], + } + ) + # mets[-1].update({"bss_mets": list(bss_values[0].values())}) + if print_progress: + print("BSS done", end="\t") + # define the PC content + pc_values = CommScores.pc( + grouping, + grouping_utils, + comm_model, + None, + comm_sol, + environ, + True, + community, + ) + report_dic.update( + { + "PC_comm": _sigfig_check(pc_values[0], 5, ""), + "PC_model1": _sigfig_check( + list(pc_values[1].values())[0], 5, "" + ), + "PC_model2": _sigfig_check( + list(pc_values[1].values())[1], 5, "" + ), + "BIT": pc_values[3], + } + ) + if print_progress: + print("PC done\tBIT done", end="\t") + # print([mem.slim_optimize() for mem in grouping]) + # define the GYD content + gyd1, gyd2, g1, g2 = list( + CommScores.gyd( + grouping, + grouping_utils, + environ, + False, + community, + anme_comm, + ).values() + )[0] + report_dic.update( + { + "GYD1": _sigfig_check(gyd1, 5, ""), + "GYD2": 
_sigfig_check(gyd2, 5, ""), + } + ) + if print_progress: + print("GYD done\t\t", end="\t" if annotated_genomes else "\n") + # define the FS content + if kbase_obj is not None and annotated_genomes and not anme_comm: + fs_values = list( + CommScores.fs( + grouping, kbase_obj, annotated_genomes=annotated_genomes + ).values() + )[0] + print( + len(fs_values[0]) if fs_values[0] is not None else "NaN", + fs_values[1], + ) + report_dic.update({"FS": sigfig.round(fs_values[1], 5)}) + if fs_values is not None: + mets[-1].update({"FS features": fs_values[0]}) + if print_progress: + print("FS done\t\t") + # return a pandas Series, which can be easily aggregated with other results into a DataFrame + series.append(Series(report_dic)) + count += 1 + return series, mets + + @staticmethod + def html_report( + df, mets, export_html_path="commscores_report.html", msdb_path=None + ): + from modelseedpy.core.report import commscores_report + + return commscores_report(df, mets, export_html_path, msdb_path) + + @staticmethod + def report_generation( + all_models: iter = None, # a list of distinct lists is provided for specifying exclusive groups + pairs: dict = None, + mem_media: dict = None, + pair_limit: int = None, + exclude_pairs: list = None, + kbase_obj=None, + annotated_genomes: dict = True, # True triggers internal acquisition of the genomes, where None skips + see_media=True, + environments: iter = None, # a collection of environment dicts or KBase media objects + pool_size: int = None, + cip_score=True, + costless=True, + skip_bad_media=False, + anme_comm=False, + print_progress=False, + ): + from pandas import concat + + if pairs: + model_pairs = unique( + [ + {model1, model2} + for model1, models in pairs.items() + for model2 in models + ] + ) + elif all_models is not None: + if not isinstance(all_models[0], list): + all_models = list(set(all_models)) + model_pairs = array(list(combinations(all_models, 2))) + else: + model_pairs = [] + for models1, models2 in combinations(all_models, 2): + models1 = set(models1) + models2 = set(models2) + if len(models1) > len(models2): + larger_list = models1 + smaller_list = models2 + else: + larger_list = models2 + smaller_list = models1 + model_pairs.append( + [ + list(zip(combin, smaller_list)) + for combin in permutations(larger_list, len(smaller_list)) + ] + ) + # flatten the assembled pairs and filter duplicates + model_pairs = array( + [ + x + for x in set( + tuple(x) + for x in [ + i + for y in list(chain.from_iterable(model_pairs)) + for i in y + ] + ) + ] + ) + all_models = list(chain.from_iterable(all_models)) + if pair_limit is not None: + shuffle(model_pairs) + new_pairs = [] + for index, pair in enumerate(model_pairs): + if set(pair) not in exclude_pairs and index < pair_limit: + new_pairs.append(pair) + elif index >= pair_limit: + break + model_pairs = array(new_pairs) + if isinstance(model_pairs[0], str): + model_pairs = unique(sort(model_pairs, axis=1)) + pairs = { + first: model_pairs[where(model_pairs[:, 0] == first)][:, 1] + for first in model_pairs[:, 0] + } + else: + raise ValueError( + "Either < all_models > or < pairs > must be defined to simulate interactions." + ) + if not all_models: + all_models = list( + chain(*[list(values) for values in pairs.values()]) + ) + list(pairs.keys()) + lazy_load = len(model_pairs) > 10000 # all_models[0], (list,set,tuple)) + if lazy_load and not kbase_obj: + ValueError( + "The < kbase_obj > argument must be provided to lazy load models." 
+ ) + new_models = [] + for index, model in enumerate(all_models): + if model.id == "": + model.id = f"model_index{index}" + new_models.append(model) + all_models = new_models[:] + if not mem_media: + models_media = _get_media( + model_s_=all_models, skip_bad_media=skip_bad_media + ) + else: + models_media = mem_media.copy() + missing_models = set() + missing_modelID = [] + for model in all_models: + if model is not None and model.id not in models_media: + missing_models.add(model) + missing_modelID.append( + model if not hasattr(model, "id") else model.id + ) + if missing_models != set(): + print( + f"Media of the {missing_modelID} models are not defined, and will be calculated separately." + ) + models_media.update( + _get_media(model_s_=missing_models), skip_bad_media=skip_bad_media + ) + if see_media: + print(f"The minimal media of all members:\n{models_media}") + print(f"\nExamining the {len(list(model_pairs))} model pairs") + if pool_size is not None: + from datetime import datetime + from multiprocess import Pool + + print( + f"Loading {int(pool_size)} workers and computing the scores", + datetime.now(), + ) + pool = Pool( + int(pool_size) + ) # .map(calculate_scores, [{k: v} for k,v in pairs.items()]) + args = [ + [ + dict([pair]), + models_media, + environments, + annotated_genomes, + lazy_load, + kbase_obj, + ] + for pair in list(pairs.items()) + ] + output = pool.map(CommScores.calculate_scores, args) + series = chain.from_iterable([ele[0] for ele in output]) + mets = chain.from_iterable([ele[1] for ele in output]) + else: + series, mets = CommScores.calculate_scores( + pairs, + models_media, + environments, + annotated_genomes, + lazy_load, + kbase_obj, + cip_score, + costless, + skip_bad_media, + anme_comm, + print_progress, + ) + return concat(series, axis=1).T, mets + + @staticmethod + def mro( + member_models: Iterable = None, + mem_media: dict = None, + min_growth=0.1, + media_dict=None, + raw_content=False, + environment=None, + skip_bad_media=False, + printing=False, + compatibilized=False, + ): + """Determine the overlap of nutritional requirements (minimal media) between member organisms.""" + # determine the member minimal media if they are not parameterized + if not mem_media: + if not member_models: + raise ParameterError( + "The either member_models or minimal_media parameter must be defined." 
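+ # MRO, in brief: the share of one member's minimal medium that its partner
+ # also requires. A sketch, assuming the media are sets of exchange IDs:
+ #   mro_1_vs_2 = 100 * len(media1 & media2) / len(media1)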
+ ) + member_models = ( + member_models + if compatibilized + else _compatibilize(member_models, printing) + ) + mem_media = _get_media( + media_dict, + None, + member_models, + min_growth, + environment, + printing=printing, + skip_bad_media=skip_bad_media, + ) + if "community_media" in mem_media: + mem_media = mem_media["members"] + # MROs = array(list(map(len, pairs.values()))) / array(list(map(len, mem_media.values()))) + mro_values = {} + for model1, model2 in combinations(member_models, 2): + intersection = set(mem_media[model1.id]["media"].keys()) & set( + mem_media[model2.id]["media"].keys() + ) + inter = [ex.replace("EX_", "").replace("_e0", "") for ex in intersection] + m1_media = mem_media[model1.id]["media"] + m2_media = mem_media[model2.id]["media"] + if raw_content: + mro_values.update( + { + f"{model1.id}---{model2.id})": (inter, m1_media), + f"{model2.id}---{model1.id})": (inter, m2_media), + } + ) + else: + mro_values.update( + { + f"{model1.id}---{model2.id})": 100 + * (len(inter) / len(m1_media), len(inter), len(m1_media)), + f"{model2.id}---{model1.id})": 100 + * (len(inter) / len(m2_media), len(inter), len(m2_media)), + "mets": inter, + } + ) + return mro_values + # return mean(list(map(len, pairs.values()))) / mean(list(map(len, mem_media.values()))) + + @staticmethod + def mip( + member_models: Iterable, + com_model=None, + min_growth=0.1, + interacting_media_dict=None, + noninteracting_media_dict=None, + environment=None, + printing=False, + compatibilized=False, + costless=False, + multi_output=False, + skip_bad_media=False, + ): + """Determine the quantity of nutrients that can be potentially sourced through syntrophy""" + member_models, community = _load_models( + member_models, com_model, not compatibilized, printing=printing + ) + # determine the interacting and non-interacting media for the specified community .util.model + noninteracting_medium, noninteracting_sol = _get_media( + noninteracting_media_dict, + community, + None, + min_growth, + environment, + False, + skip_bad_media=skip_bad_media, + ) + if noninteracting_medium is None: + return None + if "community_media" in noninteracting_medium: + noninteracting_medium = noninteracting_medium["community_media"] + interacting_medium, interacting_sol = _get_media( + interacting_media_dict, + community, + None, + min_growth, + environment, + True, + skip_bad_media=skip_bad_media, + ) + if interacting_medium is None: + return None + if "community_media" in interacting_medium: + interacting_medium = interacting_medium["community_media"] + interact_diff = DeepDiff(noninteracting_medium, interacting_medium) + if "dictionary_item_removed" not in interact_diff: + return None + cross_fed_exIDs = [ + re.sub("(root\['|'\])", "", x) + for x in interact_diff["dictionary_item_removed"] + ] + # Determine each direction of the MIP score interactions + comm_util = MSModelUtil(community) + cross_fed_metIDs = [ + ex.replace("EX_", "").replace("_e0", "") for ex in cross_fed_exIDs + ] + cross_fed_copy = cross_fed_metIDs[:] + directionalMIP = {mem.id: [] for mem in member_models} + for rxn in comm_util.transport_list(): + # print(rxn.reaction, "\t", [met.id for met in rxn.metabolites if "_e0" in met.id]) + metIDs = list( + set([met.id.split("_")[0] for met in rxn.reactants]).intersection( + set([met.id.split("_")[0] for met in rxn.products]) + ) + ) + if len(metIDs) == 1: + metID = metIDs[0] + else: + if "cpd00067" in metIDs: + metIDs.remove("cpd00067") + metID = metIDs[0] + if metID not in cross_fed_metIDs: + continue + 
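+                # Directional attribution, in brief: the compartment suffix of a
+                # transport reaction names the member that moves a cross-fed
+                # compound, and the sign of stoichiometry * flux separates donors
+                # from receivers. A sketch, assuming "c2" indexes the second member:
+                #   rxn_model = member_models[FBAHelper.compartment_index("c2") - 1]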
rxn_index = FBAHelper.compartment_index(rxn.id.split("_")[-1]) + if rxn_index == 0: + continue + mets = [met for met in rxn.metabolites if met.id == f"{metID}_c{rxn_index}"] + if mets == []: + print(f"The {metID}_c{rxn_index} is missing in {rxn.reaction}.") + continue + rxn_model = member_models[rxn_index - 1] + # comm_trans[metID] = comm_trans.get(f"{metID}_c{rxn_index}", {}) + if ( + rxn.metabolites[mets[0]] > 0 + and interacting_sol.fluxes[rxn.id] > 0 + or rxn.metabolites[mets[0]] < 0 + and interacting_sol.fluxes[rxn.id] < 0 + ): # donor + directionalMIP[rxn_model.id].append(metID) + if metID in cross_fed_copy: + cross_fed_copy.remove(metID) + continue + # if printing: print(f"{mets[0]} in {rxn.id} ({rxn.reaction}) is not assigned a receiving member.") + if cross_fed_copy != [] and printing: + print(f"Missing directions for the {cross_fed_copy} cross-fed metabolites") + outputs = [directionalMIP] + # TODO categorize all of the cross-fed substrates to examine potential associations of specific compounds + if costless: + costless_mets, numExs = CommScores.cip(member_models=member_models) + # print(list(directionalMIP.values()), costless_mets) + costlessDirectionalMIP = { + member_name: set(receive_mets).intersection(costless_mets) + for member_name, receive_mets in directionalMIP.items() + } + if not multi_output: + return costlessDirectionalMIP + outputs.append(costlessDirectionalMIP) + return outputs + + @staticmethod + def cip(modelutils=None, member_models=None): # costless interaction potential + if not modelutils: + modelutils = {MSModelUtil(model) for model in member_models} + costless_mets = set( + chain.from_iterable( + [modelutil.costless_excreta() for modelutil in modelutils] + ) + ) + return costless_mets, len(costless_mets) + + @staticmethod + def contributions(org_possible_contributions, scores, model_util, abstol): + # identify and log excreta from the solution + model_util.add_objective( + sum(ex_rxn.flux_expression for ex_rxn in org_possible_contributions) + ) + sol = model_util.model.optimize() + if sol.status != "optimal": + # exit the while loop by returning the original possible_contributions, + ## hence DeepDiff == {} and the while loop terminates + return scores, org_possible_contributions + # identify and log excreta from the solution + possible_contributions = org_possible_contributions[:] + for ex in org_possible_contributions: + if ex.id in sol.fluxes.keys() and sol.fluxes[ex.id] >= abstol: + possible_contributions.remove(ex) + scores[model_util.model.id].update([met.id for met in ex.metabolites]) + return scores, possible_contributions + + @staticmethod + def mp( + member_models: Iterable, + environment, + com_model=None, + minimal_media=None, + abstol=1e-3, + printing=False, + ): + """Discover the metabolites that each species can contribute to a community""" + community = ( + _compatibilize(com_model) + if com_model + else build_from_species_models(member_models, standardize=True) + ) + community.medium = minimal_media or MSMinimalMedia.minimize_flux(community) + scores = {} + for ( + org_model + ) in ( + member_models + ): # TODO support parsing the individual members through the MSCommunity object + model_util = MSModelUtil(org_model) + model_util.compatibilize(printing=printing) + if environment: + model_util.add_medium(environment) + scores[model_util.model.id] = set() + # determines possible member contributions in the community environment, where the excretion of media compounds is irrelevant + org_possible_contr = [ + ex_rxn + for ex_rxn in 
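+ # MP, in brief: candidate contributions are member exchanges absent from the
+ # community medium; their summed flux is maximized repeatedly, and exchanges
+ # whose flux exceeds abstol are logged as potential excreta, e.g. (a sketch)
+ #   model_util.add_objective(sum(ex.flux_expression for ex in candidates))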
model_util.exchange_list() + if (ex_rxn.id not in community.medium and ex_rxn.upper_bound > 0) + ] + # ic(org_possible_contributions, len(model_util.exchange_list()), len(community.medium)) + scores, possible_contr = CommScores.contributions( + org_possible_contr, scores, model_util, abstol + ) + while DeepDiff(org_possible_contr, possible_contr): + print("remaining possible_contributions", len(possible_contr), end="\r") + ## optimize the sum of the remaining exchanges that have not surpassed the abstol + org_possible_contr = possible_contr[:] + scores, possible_contr = CommScores.contributions( + org_possible_contr, scores, model_util, abstol + ) + + ## individually checks the remaining possible contributions + for ex_rxn in possible_contr: + model_util.model.objective = Objective(ex_rxn.flux_expression) + sol = model_util.model.optimize() + if sol.status == "optimal" or sol.objective_value > abstol: + for met in ex_rxn.metabolites: + if met.id in scores[model_util.model.id]: + scores[model_util.model.id].remove(met.id) + print("removing", met.id) + return scores + + @staticmethod + def mu( + member_models: Iterable, + environment=None, + member_excreta=None, + n_solutions=100, + abstol=1e-3, + compatibilized=False, + printing=True, + ): + """the fractional frequency of each received metabolite amongst all possible alternative syntrophic solutions""" + # member_solutions = member_solutions if member_solutions else {model.id: model.optimize() for model in member_models} + scores = {} + member_models = ( + member_models if compatibilized else _compatibilize(member_models, printing) + ) + if member_excreta: + missing_members = [ + model for model in member_models if model.id not in member_excreta + ] + if missing_members: + print( + f"The {','.join(missing_members)} members are missing from the defined " + f"excreta list and will therefore be determined through an additional MP simulation." 
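+ # MU, in brief: across up to n_solutions alternative minimal media, a received
+ # metabolite's score is its frequency of appearance (a sketch):
+ #   scores[model_id][met_id] = counter[ex_rxn] / len(media)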
+ ) + member_excreta.update(CommScores.mp(missing_members, environment)) + else: + member_excreta = CommScores.mp( + member_models, environment, None, abstol, printing + ) + for org_model in member_models: + other_excreta = set( + chain.from_iterable( + [ + excreta + for model, excreta in member_excreta.items() + if model != org_model.id + ] + ) + ) + print(f"\n{org_model.id}\tOther Excreta", other_excreta) + model_util = MSModelUtil(org_model, True) + if environment: + model_util.add_medium(environment) + ex_rxns = { + ex_rxn: list(ex_rxn.metabolites)[0] + for ex_rxn in model_util.exchange_list() + } + print(f"\n{org_model.id}\tExtracellular reactions", ex_rxns) + variables = { + ex_rxn.id: Variable( + "___".join([model_util.model.id, ex_rxn.id]), + lb=0, + ub=1, + type="binary", + ) + for ex_rxn in ex_rxns + } + model_util.add_cons_vars(list(variables.values())) + media, solutions = [], [] + sol = model_util.model.optimize() + while sol.status == "optimal" and len(solutions) < n_solutions: + solutions.append(sol) + medium = set( + [ + ex + for ex in ex_rxns + if sol.fluxes[ex.id] < -abstol and ex in other_excreta + ] + ) + model_util.create_constraint( + Constraint( + sum([variables[ex.id] for ex in medium]), + ub=len(medium) - 1, + name=f"iteration_{len(solutions)}", + ) + ) + media.append(medium) + sol = model_util.model.optimize() + counter = Counter(chain(*media)) + scores[model_util.model.id] = { + met.id: counter[ex] / len(media) + for ex, met in ex_rxns.items() + if counter[ex] > 0 + } + return scores + + @staticmethod + def sc( + member_models: Iterable = None, + com_model=None, + min_growth=0.1, + n_solutions=100, + abstol=1e-6, + compatibilized=True, + printing=False, + ): + """Calculate the frequency of interspecies dependency in a community""" + member_models, community = _load_models( + member_models, com_model, not compatibilized, printing=printing + ) + for rxn in com_model.reactions: + rxn.lower_bound = 0 if "bio" in rxn.id else rxn.lower_bound + + # c_{rxn.id}_lb: rxn < 1000*y_{species_id} + # c_{rxn.id}_ub: rxn > -1000*y_{species_id} + variables = {} + constraints = [] + # TODO this can be converted to an MSCommunity object by looping through each index + # leverage CommKinetics + for org_model in member_models: + model_util = MSModelUtil(org_model, True) + variables[model_util.model.id] = Variable( + name=f"y_{model_util.model.id}", lb=0, ub=1, type="binary" + ) + model_util.add_cons_vars([variables[model_util.model.id]]) + for rxn in model_util.model.reactions: + if "bio" not in rxn.id: + # print(rxn.flux_expression) + lb = Constraint( + rxn.flux_expression + 1000 * variables[model_util.model.id], + name="_".join(["c", model_util.model.id, rxn.id, "lb"]), + lb=0, + ) + ub = Constraint( + rxn.flux_expression - 1000 * variables[model_util.model.id], + name="_".join(["c", model_util.model.id, rxn.id, "ub"]), + ub=0, + ) + constraints.extend([lb, ub]) + + # calculate the SCS + scores = {} + for model in member_models: + com_model_util = MSModelUtil(com_model) + com_model_util.add_cons_vars(constraints, sloppy=True) + # model growth is guaranteed while minimizing the growing members of the community + ## SMETANA_Biomass: {biomass_reactions} > {min_growth} + com_model_util.create_constraint( + Constraint( + sum( + rxn.flux_expression + for rxn in model.reactions + if "bio" in rxn.id + ), + name="SMETANA_Biomass", + lb=min_growth, + ) + ) # sloppy = True) + other_members = [other for other in member_models if other.id != model.id] + com_model_util.add_objective( + 
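+ # SCS, in brief: each member carries a binary y_i that the +/-1000*y_i
+ # constraints force to 1 whenever that member carries flux; minimizing the
+ # donors' y_i under the SMETANA_Biomass bound, with an exclusion constraint
+ # added per iteration, enumerates alternative donor sets whose frequencies
+ # become the score:
+ #   scores[model.id][other.id] = donors_counter[other] / len(donors_list)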
sum([variables[other.id] for other in other_members]), "min" + ) + previous_constraints, donors_list = [], [] + for i in range(n_solutions): + sol = com_model.optimize() # FIXME The solution is not optimal + if sol.status != "optimal": + scores[model.id] = None + break + donors = [ + o + for o in other_members + if com_model.solver.primal_values[f"y_{o.id}"] > abstol + ] + donors_list.append(donors) + previous_con = f"iteration_{i}" + previous_constraints.append(previous_con) + com_model_util.add_cons_vars( + [ + Constraint( + sum(variables[o.id] for o in donors), + name=previous_con, + ub=len(previous_constraints) - 1, + ) + ], + sloppy=True, + ) + if i != 0: + donors_counter = Counter(chain(*donors_list)) + scores[model.id] = { + o.id: donors_counter[o] / len(donors_list) for o in other_members + } + return scores + + @staticmethod + def gyd( + member_models: Iterable = None, + model_utils: Iterable = None, + environment=None, + coculture_growth=False, + community=None, + anme_comm=False, + ): + gyds = {} + for combination in combinations(model_utils or member_models, 2): + if model_utils is None: + model1_util = MSModelUtil(combination[0], True) + model2_util = MSModelUtil(combination[1], True) + print( + f"{model1_util.model.id} ++ {model2_util.model.id}", + model1_util.model.slim_optimize(), + model2_util.model.slim_optimize(), + ) + if environment and not anme_comm: + model1_util.add_medium(environment) + model2_util.add_medium(environment) + else: + model1_util = combination[0] + model2_util = combination[1] + if not coculture_growth: + G_m1, G_m2 = CommScores._determine_growths([model1_util, model2_util]) + G_m1, G_m2 = G_m1 if FBAHelper.isnumber(str(G_m1)) else 0, ( + G_m2 if FBAHelper.isnumber(str(G_m2)) else 0 + ) + else: + community = community or MSCommunity( + member_models=[model1_util.model, model2_util.model], + ids=[mem.id for mem in member_models], + ) + community.run_fba() + member_growths = community.parse_member_growths() + G_m1, G_m2 = ( + member_growths[model1_util.model.id], + member_growths[model2_util.model.id], + ) + if G_m2 <= 0 or G_m1 <= 0: + gyds[f"{model1_util.model.id} ++ {model2_util.model.id}"] = ( + "", + "", + G_m1, + G_m2, + ) + continue + gyds[f"{model1_util.model.id} ++ {model2_util.model.id}"] = ( + abs(G_m1 - G_m2) / G_m1, + abs(G_m2 - G_m1) / G_m2, + G_m1, + G_m2, + ) + return gyds + + @staticmethod + def pc( + member_models=None, + modelutils=None, + com_model=None, + isolate_growths=None, + comm_sol=None, + environment=None, + comm_effects=True, + community=None, + interaction_threshold=0.1, + compatibilized=False, + ): + assert member_models or modelutils or community, ( + "Members must be defined through either < member_models >" + "or < modelutils > or < community >." 
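+ # PC, in brief:
+ #   pc_score = comm_sol.objective_value / sum(isolate_growths.values())
+ # and BIT classifies the pair by each member's coculture/monoculture growth
+ # ratio against 1 +/- interaction_threshold (mutualism, competitive, neutral,
+ # commensalism, amensalism, or parasitism).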
+ ) + member_models = ( + member_models or [mem.model for mem in modelutils] or community.members + ) + if com_model is None: + member_models, com_model = _load_models( + member_models, None, not compatibilized, printing=False + ) + community = community or MSCommunity(com_model, member_models) + if comm_sol is None: + community.util.add_medium(environment) + comm_sol = community.util.model.optimize() + model_utils = modelutils or [MSModelUtil(mem, True) for mem in member_models] + modelutils = [] + for mem in model_utils: + mem.add_medium(environment) + modelutils.append(mem) + if isolate_growths is None: + isolate_growths = {mem.id: mem.model.slim_optimize() for mem in modelutils} + pc_score = comm_sol.objective_value / sum(list(isolate_growths.values())) + if not comm_effects: + return pc_score + + comm_member_growths = { + mem.id: comm_sol.fluxes[mem.primary_biomass.id] for mem in community.members + } + comm_growth_effect = { + memID: nanFilter(comm_environ / isolate_growths[memID]) + for memID, comm_environ in comm_member_growths.items() + } + growth_diffs = array( + [nanFilter(x, False) for x in list(comm_growth_effect.values())] + ) + th_pos, th_neg = 1 + interaction_threshold, 1 - interaction_threshold + if all(growth_diffs > th_pos): + bit = "mutualism" + elif all(growth_diffs < th_neg): + bit = "competitive" + elif ((th_pos > growth_diffs) & (growth_diffs > th_neg)).all(): + bit = "neutral" + elif all(growth_diffs > th_neg) and any(growth_diffs > th_pos): + bit = "commensalism" + elif all(growth_diffs < th_pos) and any(growth_diffs < th_neg): + bit = "amensalism" + elif any(growth_diffs > th_pos) and any(growth_diffs < th_neg): + bit = "parasitism" + else: + print( + f"The relative growths {comm_growth_effect} from {comm_member_growths} coculture and" + f" {isolate_growths} monoculture are not captured." 
+ ) + bit = "" + return (pc_score, comm_growth_effect, comm_member_growths, bit) + + @staticmethod + def bss( + member_models: Iterable = None, + model_utils: Iterable = None, + environments=None, + minMedia=None, + skip_bad_media=False, + ): + def compute_score(minMedia, environment=None, index=0): + minMedia = minMedia or _get_media( + model_s_=[modelUtil.model for modelUtil in model_utils], + environment=environment, + skip_bad_media=skip_bad_media, + ) + model1_media = set( + [ + re.sub(r"(\_\w\d+$)", "", rxnID.replace("EX_", "")) + for rxnID in minMedia[model1_util.id]["media"].keys() + ] + ) + model2_media = set( + [ + re.sub(r"(\_\w\d+$)", "", rxnID.replace("EX_", "")) + for rxnID in minMedia[model2_util.id]["media"].keys() + ] + ) + model1_internal = { + rm_comp(met.id) + for rxn in model1_util.internal_list() + for met in rxn.products + } + model2_internal = { + rm_comp(met.id) + for rxn in model2_util.internal_list() + for met in rxn.products + } + bss_scores[ + f"{model1_util.id} supporting {model2_util.id} in media{index}" + ] = ( + model1_internal, + len(model2_media.intersection(model1_internal)) / len(model2_media), + ) + bss_scores[ + f"{model2_util.id} supporting {model1_util.id} in media{index}" + ] = ( + model2_internal, + len(model1_media.intersection(model2_internal)) / len(model1_media), + ) + + bss_scores = {} + for combination in combinations(model_utils or member_models, 2): + if model_utils is None: + model1_util = MSModelUtil(combination[0], True) + model2_util = MSModelUtil(combination[1], True) + model_utils = [model1_util, model2_util] + else: + model1_util = combination[0] + model2_util = combination[1] + if environments: + for index, environment in enumerate(environments): + compute_score(minMedia, environment, index) + else: + compute_score(minMedia) + return bss_scores + + @staticmethod + def mqs(): + pass + + @staticmethod + def _calculate_jaccard_score(set1, set2): + if set1 == set2: + print(f"The sets are identical, with a length of {len(set1)}.") + if len(set1.union(set2)) == 0: + return (None, None) + return ( + set1.intersection(set2), + len(set1.intersection(set2)) / len(set1.union(set2)), + ) + + @staticmethod + def get_all_genomes_from_ws( + ws_id, + kbase_object=None, + cobrakbase_repo_path: str = None, + kbase_token_path: str = None, + ): + def get_genome(genome_name): + return kbase_object.ws_client.get_objects2( + {"objects": [{"ref": f"{ws_id}/{genome_name}"}]} + )["data"][0]["data"] + + # load the kbase client instance + if not kbase_object: + import os + + os.environ["HOME"] = cobrakbase_repo_path + import cobrakbase + + with open(kbase_token_path) as token_file: + kbase_object = cobrakbase.KBaseAPI(token_file.readline()) + + # calculate the complementarity + genome_list = kbase_object.ws_client.list_objects( + { + "ids": [ws_id], + "type": "KBaseGenomes.Genome", + "minObjectID": 0, + "maxObjectID": 10000, + } + ) + genome_names = [g[1] for g in genome_list if g[1].endswith("RAST")] + return { + genome_name: set( + [ + sso + for j in get_genome(genome_name)["cdss"] + for sso in j["ontology_terms"]["SSO"].keys() + ] + ) + for genome_name in genome_names + } + + @staticmethod + def fs( + models: Iterable = None, + kbase_object=None, + cobrakbase_repo_path: str = None, + kbase_token_path: str = None, + annotated_genomes: dict = None, + printing=False, + ): + if not isinstance(annotated_genomes, dict): + if not kbase_object: + import os + + os.environ["HOME"] = cobrakbase_repo_path + import cobrakbase + + with open(kbase_token_path) as 
token_file: + kbase_object = cobrakbase.KBaseAPI(token_file.readline()) + annotated_genomes = { + model.id: kbase_object.get_from_ws(model.genome_ref) + for model in models + if hasattr(model, "genome_ref") + } + elif isinstance(annotated_genomes, list): + annotated_genomes = dict( + zip([model.id for model in models], annotated_genomes) + ) + elif models is not None: + annotated_genomes = { + k: v + for k, v in annotated_genomes.items() + if k in [model.id for model in models] + } + genome_combinations = list(combinations(annotated_genomes.keys(), 2)) + if printing: + print( + f"The Functionality Score (FS) will be calculated for {len(genome_combinations)} pairs." + ) + if not isinstance(list(annotated_genomes.values())[0], dict): + genome1_set, genome2_set = set(), set() + distances = {} + for genome1, genome2 in genome_combinations: + for j in annotated_genomes[genome1].features: + for key, val in j.ontology_terms.items(): + if key == "SSO": + genome1_set.update(val) + for j in annotated_genomes[genome2].features: + for key, val in j.ontology_terms.items(): + if key == "SSO": + genome2_set.update(val) + distances[f"{genome1} ++ {genome2}"] = ( + CommScores._calculate_jaccard_score(genome1_set, genome2_set) + ) + else: + distances = { + f"{genome1} ++ {genome2}": CommScores._calculate_jaccard_score( + set( + list(content["SSO"].keys())[0] + for dic in annotated_genomes[genome1]["cdss"] + for x, content in dic.items() + if x == "ontology_terms" and len(content["SSO"].keys()) > 0 + ), + set( + list(content["SSO"].keys())[0] + for dic in annotated_genomes[genome2]["cdss"] + for x, content in dic.items() + if x == "ontology_terms" and len(content["SSO"].keys()) > 0 + ), + ) + for genome1, genome2 in combinations(annotated_genomes.keys(), 2) + } + return distances + + @staticmethod + def smetana( + member_models: Iterable, + environment, + com_model=None, + min_growth=0.1, + n_solutions=100, + abstol=1e-6, + prior_values=None, + compatibilized=False, + sc_coupling=False, + printing=False, + ): + """Quantifies the extent of syntrophy as the sum of all exchanges in a given nutritional environment""" + member_models, community = _load_models( + member_models, com_model, compatibilized == False, printing=printing + ) + sc = None + if not prior_values: + mp = CommScores.mp(member_models, environment, com_model, abstol) + mu = CommScores.mu( + member_models, environment, mp, n_solutions, abstol, compatibilized + ) + if sc_coupling: + sc = CommScores.sc( + member_models, + com_model, + min_growth, + n_solutions, + abstol, + compatibilized, + ) + elif len(prior_values) == 3: + sc, mu, mp = prior_values + else: + mu, mp = prior_values + + smetana_scores = {} + for pairs in combinations(member_models, 2): + for model1, model2 in permutations(pairs): + if model1.id not in smetana_scores: + smetana_scores[model1.id] = {} + if not any([not mu[model1.id], not mp[model1.id]]): + sc_score = 1 if not sc_coupling else sc[model1.id][model2.id] + models_mets = list(model1.metabolites) + list(model2.metabolites) + unique_mets = set([met.id for met in models_mets]) + smetana_scores[model1.id][model2.id] = 0 + for met in models_mets: + if met.id in unique_mets: + mp_score = 0 if met.id not in mp[model1.id] else 1 + smetana_scores[model1.id][model2.id] += ( + mu[model1.id].get(met.id, 0) * sc_score * mp_score + ) + return smetana_scores + + @staticmethod + def antiSMASH(json_path=None, zip_path=None): + # TODO Scores 2, 4, and 5 are being explored for relevance to community formation and reveal specific member 
interactions/targets + # load the antiSMASH report from either the JSON or the raw ZIP, or both + from os import mkdir, listdir, path + from zipfile import ZipFile + from json import load + + if json_path: + cwd_files = listdir() + if json_path not in cwd_files and zip_path: + with ZipFile(zip_path, "r") as zip_file: + zip_file.extract(json_path) + with open(json_path, "r") as json_file: + data = load(json_file) + elif zip_path: + mkdir("extracted_antiSMASH") + with ZipFile(zip_path, "r") as zip_file: + zip_file.extractall("extracted_antiSMASH") + json_files = [ + x for x in listdir("extracted_antiSMASH") if x.endswith("json") + ] + if len(json_files) > 1: + print( + f"The antiSMASH report describes {len(json_files)} JSON files, the first of which is selected " + f"{json_files[0]} for analysis, otherwise explicitly identify the desired JSON file in the json_path parameter." + ) + with open( + path.join("extracted_antiSMASH", json_files[0]), "r" + ) as json_file: + data = load(json_file) + else: + raise ParameterError( + "Either the json_path or zip_path from the antiSMASH analysis must be provided," + " for these scores to be determined." + ) + # Parse data and scores from the antiSMASH report + biosynthetic_areas = data["records"][0]["areas"] + BGCs = set( + array( + [ + data["records"][0]["areas"][i]["products"] + for i in range(biosynthetic_areas) + ] + ).flatten() + ) + len_proteins = len( + data["records"][0]["modules"]["antismash.modules.clusterblast"][ + "knowncluster" + ]["proteins"] + ) + protein_annotations = [ + data["records"][0]["modules"]["antismash.modules.clusterblast"][ + "knowncluster" + ]["proteins"][i]["annotations"] + for i in range(len_proteins) + ] + clusterBlast = [s for s in protein_annotations if "resistance" in s] + num_clusterBlast = sum( + [item.count("resistance") for item in protein_annotations] + ) + + return ( + biosynthetic_areas, + BGCs, + protein_annotations, + clusterBlast, + num_clusterBlast, + ) diff --git a/modelseedpy/community/commscores_template.html b/modelseedpy/community/commscores_template.html new file mode 100644 index 00000000..b379568a --- /dev/null +++ b/modelseedpy/community/commscores_template.html @@ -0,0 +1,157 @@ + + + + + + CommScores Results + + + + + + + + + + + + + + + +

+      CommScores Results  <!-- remaining template markup not preserved in this diff view -->

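For orientation between the two modules in this diff, a minimal sketch of driving the scores end-to-end; the module path, model files, and IDs are hypothetical assumptions, while the call signatures follow the definitions above.

```python
# A sketch, not a tested recipe: score two hypothetical members pairwise and
# render the resulting DataFrame with the bundled HTML template.
from cobra.io import read_sbml_model
from modelseedpy.community.commscores import CommScores  # assumed module path

m1 = read_sbml_model("member1.xml")  # hypothetical SBML files
m2 = read_sbml_model("member2.xml")

# report_generation returns a transposed DataFrame of per-pair scores plus the
# cross-fed metabolite records that html_report consumes.
df, mets = CommScores.report_generation(all_models=[m1, m2], print_progress=True)
CommScores.html_report(df, mets, export_html_path="commscores_report.html")
```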
+ + + + \ No newline at end of file diff --git a/modelseedpy/community/datastandardization.py b/modelseedpy/community/datastandardization.py new file mode 100644 index 00000000..026d008f --- /dev/null +++ b/modelseedpy/community/datastandardization.py @@ -0,0 +1,1193 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Aug 1 11:44:07 2022 + +@author: Andrew Freiburger +""" +from modelseedpy.community.commhelper import phenotypes +from modelseedpy.core.exceptions import ParameterError +from modelseedpy.core.optlanghelper import isIterable +from modelseedpy.core.fbahelper import FBAHelper +from optlang import Constraint +from optlang.symbolics import Zero +from scipy.constants import hour +from zipfile import ZipFile, ZIP_LZMA +from itertools import chain +from typing import Union, Iterable +from copy import deepcopy + +# from cplex import Cplex +import logging, json, os, re +from pandas import read_csv, DataFrame, ExcelFile +import numpy as np + + +import logging + +logger = logging.getLogger(__name__) + + +def isnumber(string): + try: + float(string) + except: + return False + return True + + +def _findDate(string, numerical=False): + monthNames = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] + monthNums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + days = list(range(31, 0, -1)) # [f"{num}-" for num in list(range(31,0,-1))] + years = list(range(2010, 2025)) + list( + range(10, 25) + ) # [f"-{num}" for num in list(range(2000, 2100))] + americanDates = [ + f"{mon}-{day}-{year}" for mon in monthNums for day in days for year in years + ] + + for date in americanDates: + if re.search(date, string): + month, day, year = date.split("-") + if numerical: + return "-".join([day, month, year]) + return f"{monthNames[int(month)-1][:3]} {day}, {year}" + # # determine the month + # for monName in monthNames: + # if re.search(monName, string): + # month = monName + # break + # if not month: + # for monNum in monthNums: + # if re.search(monNum, string): + # month = monNum # maybe should be converted to the Name for standardization + # # determine the day + # for dayNum in days: + # if re.search(dayNum, string): + # day = dayNum + # break + # # determine the year + # for yearNum in years: + # if re.search(yearNum, string): + # year = yearNum + # break + # return day+month+year + + +def dict_keys_exists(dic, *keys): + if keys[0] in dic: + remainingKeys = keys[1:] + if len(remainingKeys) > 0: + dict_keys_exists(dic[keys[0]], keys[1:]) + return True + return False + + +def find_dic_number(dic): + for k, v in dic.items(): + if isnumber(v): + return v + num = find_dic_number(dic[k]) + return num + + +def default_dict_values(dic, key, default): + return default if not key in dic else dic[key] + + +def trial_contents(short_code, indices_tup, values): + matches = [ele == short_code for ele in indices_tup] + return np.array(values)[matches] + + +def _spreadsheet_extension_load(path): + if ".csv" in path: + return read_csv(path) + elif ".xls" in path: + return ExcelFile(path) + + +def _spreadsheet_extension_parse(path, raw_data, org_sheet): + if ".csv" in path: + return raw_data + elif ".xls" in path: + return raw_data.parse(org_sheet) + + +def _met_id_parser(met): + met_id = re.sub("(\_\w\d+)", "", met) + met_id = met_id.replace("EX_", "", 1) + met_id = met_id.replace("c_", "", 1) + return met_id + + +def _column_reduction(org_df): + dataframe = org_df.copy() # this prevents an irrelevant warning from pandas + 
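+     # In brief: the "Well" column becomes the index, metadata columns
+     # ("Plate", "Well", "Cycle") are dropped, and the surviving timepoint
+     # labels are coerced to ints, equivalent to (a sketch)
+     #   dataframe.columns = [int(float(c)) for c in dataframe.columns]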
dataframe.columns = map(str, dataframe.columns) + dataframe.index = dataframe["Well"] + dataframe.drop("Well", axis=1, inplace=True) + for col in dataframe.columns: + if any([x in col for x in ["Plate", "Well", "Cycle"]]): + dataframe.drop(col, axis=1, inplace=True) + dataframe.columns = list(map(int, list(map(float, dataframe.columns)))) + return dataframe + + +def _remove_trials(org_df, ignore_trials, signal, name, significant_deviation): + # refine the ignore_trials parameter + if isinstance(ignore_trials, dict): + ignore_trials["columns"] = ( + list(map(str, ignore_trials["columns"])) + if "columns" in ignore_trials + else [] + ) + ignore_trials["rows"] = ( + list(map(str, ignore_trials["rows"])) if "rows" in ignore_trials else [] + ) + ignore_trials["wells"] = ( + ignore_trials["wells"] if "wells" in ignore_trials else [] + ) + elif isIterable(ignore_trials): + if ignore_trials[0][0].isalpha() and isnumber(ignore_trials[0][1:]): + short_code = True # TODO - drop trials with respect to the short codes, and not the full codes + + dataframe = org_df.copy() # this prevents an irrelevant warning from pandas + dropped_trials = [] + for trial in dataframe.index: + if ( + isinstance(ignore_trials, dict) + and any( + [ + trial[0] in ignore_trials["rows"], + trial[1:] in ignore_trials["columns"], + trial in ignore_trials["wells"], + ] + ) + or isIterable(ignore_trials) + and trial in ignore_trials + ): + dataframe.drop(trial, axis=0, inplace=True) + dropped_trials.append(trial) + elif isIterable(ignore_trials) and trial in ignore_trials: + dataframe.drop(trial, axis=0, inplace=True) + dropped_trials.append(trial) + removed_trials = [] + if "OD" not in signal: + for trial, row in dataframe.iterrows(): + row_array = np.array(row.to_list()) + ## remove trials for which the biomass growth did not change by the determined minimum deviation + if row_array[-1] / row_array[0] < significant_deviation: + dataframe.drop(trial, axis=0, inplace=True) + removed_trials.append(trial) + if removed_trials: + print( + f"The {removed_trials} trials were removed from the {name} measurements, " + f"with their deviation over time being less than the threshold of {significant_deviation}." + ) + if dropped_trials: + print( + f"The {dropped_trials} trials were dropped from the {name} measurements " + "per the ignore_trials parameter." 
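+ # The deviation filter, in brief: for non-OD signals a trial survives only if
+ # its last reading grew sufficiently relative to its first (a sketch):
+ #   keep = row_array[-1] / row_array[0] >= significant_deviation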
+ ) + return dataframe, dropped_trials + removed_trials + + +def _check_plateau(org_df, signal, name, significant_deviation, timesteps_len): + significant_deviation = max([2, significant_deviation]) + dataframe = org_df.copy() # this prevents an irrelevant warning from pandas + dropped = [] + for trial, row in dataframe.iterrows(): + row_array = np.array(row.to_list()) + values = [] + tracking = False + ## remove trials for which the biomass growth did not change by the determined minimum deviation + for index, val in enumerate(row_array): + if val / row_array[0] >= significant_deviation or tracking: + tracking = True + values.append(val) + if len(values) > timesteps_len: + del values[0] + remaining_values = list(dataframe.columns[index - timesteps_len + 1 :]) + if all( + [ + len(values) == timesteps_len, + values[-1] <= values[0], + remaining_values[0] <= remaining_values[-1] * 1.1, + ] + ): + # the entire plateau, minus the first point of plateau, are removed + dropped = remaining_values + break + if dropped: + break + if dropped: + content = f"{name} {signal}" if name != signal else signal + print( + f"The {dropped} timesteps (with {row_array[index-len(values)+1:]} values) were removed " + f"from the {content} data since the OD plateaued and is no longer valid." + ) + return dropped + + +def _remove_timesteps(org_df, ignore_timesteps, name, signal): + dataframe = org_df.copy() # this prevents an irrelevant warning from pandas + if ignore_timesteps: + dropped = [] + for col in dataframe: + if col in ignore_timesteps: + dataframe.drop(col, axis=1, inplace=True) + dropped.append(col) + if dropped == ignore_timesteps: + print( + f"The ignore_timesteps columns were dropped for the {name} {signal} data." + ) + else: + raise ParameterError( + f"The ignore_timesteps values {ignore_timesteps} " + f"were unsuccessfully dropped for the {name} {signal} data." 
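+ # The plateau check above, in brief: once a trial deviates significantly, a
+ # sliding window of timesteps_len readings is tracked; a window that stops
+ # rising, e.g. values[-1] <= values[0], marks the trailing timesteps as a
+ # plateau to be dropped.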
+ ) + return dataframe, ignore_timesteps + + +def _df_construction( + name, + df_name, + ignore_trials, + ignore_timesteps, + significant_deviation, + dataframe, + row_num, + buffer_col1=True, +): + # refine the DataFrames + time_df = _column_reduction(dataframe.iloc[0::2]) + values_df = _column_reduction(dataframe.iloc[1::2]) + # display(name, time_df, values_df) + + # remove specified data trials + if ignore_trials: + values_df, removed_trials = _remove_trials( + values_df, ignore_trials, df_name, name, significant_deviation + ) + for row in removed_trials: + time_df.drop(row, axis=0, inplace=True) + + # remove specified data timesteps + if ignore_timesteps: + values_df, removed_timesteps = _remove_timesteps( + values_df, ignore_timesteps, name, df_name + ) + for col in list(map(int, removed_timesteps)): + time_df.drop(col, axis=1, inplace=True) + + # remove undefined trials + if buffer_col1: + possible_rows = [chr(ord("A") + row) for row in range(1, row_num + 1)] + for trial_code in values_df.index: + if trial_code[0] not in possible_rows: + values_df.drop(trial_code, axis=0, inplace=True) + time_df.drop(trial_code, axis=0, inplace=True) + + # process the data for subsequent operations and optimal efficiency + values_df.astype(str) + time_df.astype(str) + return time_df, values_df + + +def _find_culture(string): + matches = re.findall(r"([A-Z]{2}\+?[A-Z]*)", string) + return [m for m in matches if not any([x in m for x in ["BIOLOG", "III"]])] + + +def reverse_strip_comp(ID): + return ID.replace("~", "-") + + +def _process_csv(self, csv_path, index_col): + self.zipped_output.append(csv_path) + csv = read_csv(csv_path) + csv.index = csv[index_col] + csv.drop(index_col, axis=1, inplace=True) + csv.astype(str) + return csv + + +def add_rel_flux_cons(model, ex, phenoRXN, carbon_ratio, rel_flux=0.2): + # {ex.id}_uptakeLimit: {net_{carbonous_ex}} >= {net_{carbon_source}}*{rel_flux}*{carbon_ratio} + # The negative flux sign of influxes specifies that the carbon_source value must be lesser than the other + # carbon influx that is being constrained. 
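+     # Written out, the coefficients below encode (with net(r) defined as
+     # r.forward_variable - r.reverse_variable):
+     #   net(ex) - rel_flux * carbon_ratio * net(phenoRXN) >= 0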
+ cons = Constraint(Zero, lb=0, ub=None, name=f"{ex.id}_uptakeLimit") + model.add_cons_vars(cons) + cons.set_linear_coefficients( + { + ex.forward_variable: 1, + ex.reverse_variable: -1, + phenoRXN.forward_variable: -rel_flux * carbon_ratio, + phenoRXN.reverse_variable: rel_flux * carbon_ratio, + } + ) + return model, cons + + +class GrowthData: + + @staticmethod + def process( + community_members: dict, + base_media=None, + solver: str = "glpk", + all_phenotypes=True, + data_paths: dict = None, + species_abundances: str = None, + carbon_conc_series: dict = None, + ignore_trials: Union[dict, list] = None, + ignore_timesteps: list = None, + species_identities_rows=None, + significant_deviation: float = 2, + extract_zip_path: str = None, + determine_requisite_biomass=False, + ): # , msdb_path:str=None): + # define the number of rows in the experimental data + row_num = len(species_identities_rows) + if "rows" in carbon_conc_series and carbon_conc_series["rows"]: + row_num = len(list(carbon_conc_series["rows"].values())[0]) + # load and parse data and metadata + ( + media_conc, + data_timestep_hr, + simulation_time, + dataframes, + trials, + fluxes_df, + ) = GrowthData.load_data( + base_media, + community_members, + solver, + data_paths, + ignore_trials, + all_phenotypes, + ignore_timesteps, + significant_deviation, + row_num, + extract_zip_path, + ) + experimental_metadata, standardized_carbon_conc, trial_name_conversion = ( + GrowthData.metadata( + base_media, + community_members, + species_abundances, + carbon_conc_series, + species_identities_rows, + row_num, + _findDate(data_paths["path"]), + ) + ) + data_df = GrowthData.data_process(dataframes, trial_name_conversion) + requisite_biomass = ( + {} + if not determine_requisite_biomass + else GrowthData.biomass_growth( + carbon_conc_series, + fluxes_df, + data_df.index.unique(), + trial_name_conversion, + data_paths, + community_members if all_phenotypes else None, + ) + ) + return ( + experimental_metadata, + data_df, + fluxes_df, + standardized_carbon_conc, + requisite_biomass, + trial_name_conversion, + np.mean(data_timestep_hr), + simulation_time, + media_conc, + ) + + @staticmethod + def load_data( + base_media, + community_members, + solver, + data_paths, + ignore_trials, + all_phenotypes, + ignore_timesteps, + significant_deviation, + row_num, + extract_zip_path, + min_timesteps=False, + ): + # define default values + significant_deviation = significant_deviation or 0 + data_paths = data_paths or {} + ignore_timesteps = ignore_timesteps or "0:0" + start, end = ignore_timesteps.split(":") + raw_data = _spreadsheet_extension_load(data_paths["path"]) + for org_sheet, name in data_paths.items(): + if org_sheet == "path": + continue + df = _spreadsheet_extension_parse(data_paths["path"], raw_data, org_sheet) + df.columns = df.iloc[6] + df.drop(df.index[:7], inplace=True) + ## acquire the default start and end indices of ignore_timesteps + start = int(start or df.columns[0]) + end = int(end or df.columns[-1]) + break + ignore_timesteps = list(range(start, end + 1)) if start != end else None + if extract_zip_path: + with ZipFile(extract_zip_path, "r") as zp: + zp.extractall() + + # define only species for which data is defined + fluxes_df, comm_members = phenotypes( + community_members, all_phenotypes, solver=solver + ) + modeled_species = list( + v for v in data_paths.values() if ("OD" not in v and " " not in v) + ) + removed_phenotypes = [ + col + for col in fluxes_df + if not any([species in col for species in modeled_species]) + ] + 
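+         # data_paths, in brief: a "path" entry plus sheet-name -> signal-name
+         # mappings; a hypothetical example matching the parsing above:
+         #   {"path": "growth_data.xlsx", "Sheet1": "OD", "Sheet2": "ecoli"}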
fluxes_df.drop(removed_phenotypes, axis=1, inplace=True) + if removed_phenotypes: + print( + f"The {removed_phenotypes} phenotypes were removed " + f"since their species is not among those with data: {modeled_species}." + ) + + # determine the time range in which all datasets are significant + data_timestep_hr = [] + dataframes = {} + max_timestep_cols = [] + if min_timesteps: + for org_sheet, name in data_paths.items(): + if org_sheet == "path" or "OD" in sheet: + continue + ## define the DataFrame + sheet = org_sheet.replace(" ", "_") + df_name = f"{name}:{sheet}" + dataframes[df_name] = _spreadsheet_extension_parse( + data_paths["path"], raw_data, org_sheet + ) + dataframes[df_name].columns = dataframes[df_name].iloc[6] + dataframes[df_name].drop(dataframes[df_name].index[:7], inplace=True) + ## parse the timesteps from the DataFrame + drop_timestep_range = GrowthData._min_significant_timesteps( + dataframes[df_name], + ignore_timesteps, + significant_deviation, + ignore_trials, + df_name, + name, + ) + max_timestep_cols.append(drop_timestep_range) + ## timesteps that must be dropped for the most restrictive dataset is acquired + max_cols = max(list(map(len, max_timestep_cols))) + for ignore_timesteps in max_timestep_cols: + if len(ignore_timesteps) == max_cols: + break + + # remove trials for which the OD has plateaued + # TODO - this somehow seems to break when the requisite_biomass is ignored + for org_sheet, name in data_paths.items(): + if "OD" not in name: + continue + ## load the OD DataFrame + sheet = org_sheet.replace(" ", "_") + df_name = f"{name}:{sheet}" + dataframes[df_name] = _spreadsheet_extension_parse( + data_paths["path"], raw_data, org_sheet + ) + dataframes[df_name].columns = dataframes[df_name].iloc[6] + dataframes[df_name].drop(dataframes[df_name].index[:7], inplace=True) + ## process the OD DataFrame + data_times_df, data_values_df = _df_construction( + name, + df_name, + ignore_trials, + ignore_timesteps, + significant_deviation, + dataframes[df_name], + row_num, + ) + plateaued_times = _check_plateau( + data_values_df, name, name, significant_deviation, 3 + ) + ## define and store the final DataFrames + for col in plateaued_times: + if col in data_times_df.columns: + data_times_df.drop(col, axis=1, inplace=True) + if col in data_values_df.columns: + data_values_df.drop(col, axis=1, inplace=True) + dataframes[df_name] = (data_times_df, data_values_df) + break + + # refine the non-OD signals + for org_sheet, name in data_paths.items(): + if org_sheet == "path" or "OD" in name: + continue + sheet = org_sheet.replace(" ", "_") + df_name = f"{name}:{sheet}" + if df_name not in dataframes: + dataframes[df_name] = _spreadsheet_extension_parse( + data_paths["path"], raw_data, org_sheet + ) + dataframes[df_name].columns = dataframes[df_name].iloc[6] + dataframes[df_name].drop(dataframes[df_name].index[:7], inplace=True) + # parse the DataFrame for values + simulation_time = dataframes[df_name].iloc[0, -1] / hour + data_timestep_hr.append( + simulation_time / int(dataframes[df_name].columns[-1]) + ) + # define the times and data + data_times_df, data_values_df = _df_construction( + name, + df_name, + ignore_trials, + ignore_timesteps, + significant_deviation, + dataframes[df_name], + row_num, + ) + # display(data_times_df) ; display(data_values_df) + for col in plateaued_times: + if col in data_times_df.columns: + data_times_df.drop(col, axis=1, inplace=True) + if col in data_values_df.columns: + data_values_df.drop(col, axis=1, inplace=True) + dataframes[df_name] = 
(data_times_df, data_values_df) + + # differentiate the phenotypes for each species + trials = set( + chain.from_iterable( + [list(times.index) for times, values in dataframes.values()] + ) + ) + media_conc = ( + {} + if not base_media + else {cpd.id: cpd.concentration for cpd in base_media.mediacompounds} + ) + return ( + media_conc, + data_timestep_hr, + simulation_time, + dataframes, + trials, + fluxes_df, + ) + + @staticmethod + def _min_significant_timesteps( + full_df, ignore_timesteps, significant_deviation, ignore_trials, df_name, name + ): + # refine the DataFrames + values_df = _column_reduction(full_df.iloc[1::2]) + values_df, removed_trials = _remove_trials( + values_df, ignore_trials, df_name, name, significant_deviation + ) + timestep_range = list(set(list(values_df.columns)) - set(ignore_timesteps)) + start, end = ignore_timesteps[0], ignore_timesteps[-1] + start_index = list(values_df.columns).index(start) + end_index = list(values_df.columns).index(end) + ## adjust the customized range such that the threshold is reached. + for trial, row in values_df.iterrows(): + row_array = np.delete( + np.array(row.to_list()), list(range(start_index, end_index + 1)) + ) + ## remove trials for which the biomass growth did not change by the determined minimum deviation + while all( + [ + row_array[-1] / row_array[0] < significant_deviation, + end <= values_df.columns[-1], + start >= values_df.columns[0], + ] + ): + # print(timestep_range[0], values_df.columns[0], values_df.columns[-1], end, start) + if ( + timestep_range[0] == values_df.columns[0] + and start != values_df.columns[-1] + ): + timestep_range.append(timestep_range[-1] + 1) + start += 1 + print( + f"The end boundary for {name} is increased to {timestep_range[-1]}", + end="\r", + ) + elif ( + timestep_range[-1] == values_df.columns[-1] + and end != values_df.columns[0] + ): + timestep_range.append(timestep_range[0] - 1) + end -= 1 + print( + f"The start boundary for {name} is decreased to {timestep_range[0]}", + end="\r", + ) + else: + raise ParameterError( + f"All of the timesteps were omitted for {name}." 
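+ # The search above, in brief: the ignored window is widened one timestep at a
+ # time, from whichever boundary still has room, until the remaining readings
+ # satisfy row_array[-1] / row_array[0] >= significant_deviation; exhausting
+ # both boundaries raises this error.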
+ ) + row_array = np.delete( + np.array(row.to_list()), + list( + range( + list(values_df.columns).index(start), + list(values_df.columns).index(end) + 1, + ) + ), + ) + print("\n") + return list(range(start, end + 1)) + + @staticmethod + def metadata( + base_media, + community_members, + species_abundances, + carbon_conc, + species_identities_rows, + row_num, + date, + ): + # define carbon concentrations for each trial + carbon_conc = carbon_conc or {} + carbon_conc["columns"] = default_dict_values(carbon_conc, "columns", {}) + carbon_conc["rows"] = default_dict_values(carbon_conc, "rows", {}) + column_num = len(species_abundances) + + # define the metadata DataFrame and a few columns + constructed_experiments = DataFrame( + index=[f"G{x+1}" for x in list(range(column_num * row_num))] + ) + constructed_experiments.index.name = "short_code" + base_media_path = ( + "minimal components media" if not base_media else base_media.path[0] + ) + constructed_experiments["base_media"] = [base_media_path] * ( + column_num * row_num + ) + + # define community content + # species_mets = {mem["name"]: np.array([mets["consumed"] for mets in mem["phenotypes"].values()]).flatten() + # for mem in community_members.values()} + # define the strains column + strains, additional_compounds, experiment_ids = [], [], [] + trial_name_conversion = {} + count = 1 + ## apply universal values to all trials + base_row_conc = ( + [] + if "*" not in carbon_conc + else [ + ":".join( + [met, str(carbon_conc["*"][met][0]), str(carbon_conc["*"][met][1])] + ) + for met in carbon_conc["*"] + ] + ) + members = list(mem["name"] for mem in community_members.values()) + for row in range(1, row_num + 1): + row_conc = base_row_conc[:] + trial_letter = chr(ord("A") + row) + trial_name_conversion[trial_letter] = {} + ## add rows where the initial concentration in the first trial is non-zero + for met, conc_dict in carbon_conc["rows"].items(): + if conc_dict[sorted(list(conc_dict.keys()))[row - 1]] > 0: + row_conc.append( + ":".join( + [ + met, + str(conc_dict[sorted(list(conc_dict.keys()))[row - 1]]), + str( + conc_dict[ + sorted(list(conc_dict.keys()), reverse=True)[ + -row + ] + ] + ), + ] + ) + ) + + row_concentration = ";".join(row_conc) + composition = {} + for col in range(1, column_num + 1): + ## construct the columns of information + additional_compounds.append(row_concentration) + experiment_id = [] + for member in members: + ### define the relative community abundances + composition[member] = [ + member, + f"r{species_abundances[col][member]}", + ] + ### define the member strain, where it is appropriate + if member in species_identities_rows[row]: + composition[member][ + 0 + ] += f"_{species_identities_rows[row][member]}" + ### the experimental ID is abundance+memberID + if int(composition[member][1][1:]) != 0: + experiment_id.append( + f"{composition[member][1]}_{composition[member][0]}" + ) + composition[member] = ":".join(composition[member]) + strains.append(";".join(composition[member] for member in members)) + # for row2 in row_conc: + # metID, init, end = row2.split(':') + # ### get the met_name for the corresponding match in values + # met_name = None + # for index, mets in enumerate(species_mets.values()): + # if metID in mets: + # met_name = list(species_mets.keys())[index] + # break + # if "met_name" not in locals() or not met_name: + # logger.critical(f"The specified phenotypes {species_mets} for the {members} members" + # f" does not include the consumption of the available sources" + # f" {row_conc}; hence, the 
model cannot grow.") + # content = "" + # else: + # content = f"{init}_{met_name}" + # experiment_id.append(content) + experiment_id.extend([":".join(row.split(":")[:2]) for row in row_conc]) + experiment_id = "-".join(experiment_id) + experiment_ids.append(experiment_id) + trial_name_conversion[trial_letter][str(col + 1)] = ( + "G" + str(count), + experiment_id, + ) + count += 1 + + # convert the variable concentrations to short codes + standardized_carbon_conc = {} + for met, conc in carbon_conc["rows"].items(): + standardized_carbon_conc[met] = {} + for row, val in conc.items(): + standardized_carbon_conc[met].update( + { + short_code: val + for (short_code, expID) in trial_name_conversion[row].values() + } + ) + for met, conc in carbon_conc["columns"].items(): + standardized_carbon_conc[met] = default_dict_values( + standardized_carbon_conc, met, {} + ) + for col, val in conc.items(): + for row in trial_name_conversion: + standardized_carbon_conc[met][ + trial_name_conversion[row][str(col)][0] + ] = val + + # add columns to the exported dataframe + constructed_experiments.insert(0, "trial_IDs", experiment_ids) + constructed_experiments["additional_compounds"] = additional_compounds + constructed_experiments["strains"] = strains + constructed_experiments["date"] = [date] * (column_num * row_num) + constructed_experiments.to_csv("growth_metadata.tsv", sep="\t") + return constructed_experiments, standardized_carbon_conc, trial_name_conversion + + @staticmethod + def biomass_growth( + carbon_conc, + fluxes_df, + data_df_trials, + trial_name_conversion, + data_paths, + community_members=None, + pheno_info=None, + ): + # TODO - leverage cFBA to partition metabolite consumption between the defined phenotypes + pheno_info = pheno_info or { + f"{content['name']}_{pheno}": mets + for model, content in community_members.items() + for pheno, mets in content["phenotypes"].items() + } + # invert the trial_name_conversion and data_paths keys and values + short_code_trials = { + contents[0]: row + col + for row in trial_name_conversion + for col, contents in trial_name_conversion[row].items() + } + # short_code_trials = {contents[0]:contents[1] for contents in trial_name_conversion[row].values()} + name_signal = {name: signal for signal, name in data_paths.items()} + + # calculate the 90% concentration for each carbon source + requisite_fluxes = {} + for trial in [short_code_trials[ID] for ID in data_df_trials]: + row_letter = trial[0] + col_number = trial[1:] + ## add rows where the initial concentration in the first trial is non-zero + utilized_phenos = {} + food_gradient = carbon_conc.copy() + for dimension, content in food_gradient.items(): + for met, conc_dict in content.items(): + source_conc = conc_dict[ + row_letter if dimension == "rows" else int(col_number) + ] + # print(met, source_conc) + if source_conc == 0 or f"EX_{met}_e0" not in fluxes_df.index: + continue + for pheno, val in fluxes_df.loc[f"EX_{met}_e0"].items(): + # print(pheno, val) + if val < 0: + utilized_phenos[pheno] = source_conc * 0.9 / val + total_consumed = sum(list(utilized_phenos.values())) + # print(utilized_phenos) + + display(fluxes_df) + short_code = trial_name_conversion[row_letter][col_number][0] + requisite_fluxes[short_code] = {} + excreta = {} + for pheno, flux_conversion in utilized_phenos.items(): + species, phenotype = pheno.split("_", 1) + fluxes = ( + fluxes_df.loc[:, pheno] + * abs(flux_conversion) + * abs(flux_conversion / total_consumed) + ) + 
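+             # The scaling above, in brief: each utilized phenotype is assumed to
+             # absorb 90% of its carbon source, so (a sketch)
+             #   flux_conversion = 0.9 * source_conc / uptake_flux
+             # and the phenotype's whole flux profile is rescaled by its share of
+             # the total consumption.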
requisite_fluxes[short_code][f"{species}|{name_signal[species]}"] = ( + fluxes[fluxes != 0] + ) + pheno = reverse_strip_comp(pheno) + if "excreted" in pheno_info[pheno]: + # print(pheno_info[pheno]["excreted"]) + excreta.update( + {met: fluxes.loc[met] for met in pheno_info[pheno]["excreted"]} + ) + ## determine the fluxes for the other members of the community through cross-feeding + participated_species = [] + for pheno, mets in pheno_info.items(): + species, phenotype = pheno.split("_", 1) + if ( + any([species in ph for ph in utilized_phenos]) + or species in participated_species + ): + continue + for met in mets["consumed"]: + exMet = f"EX_{met}_e0" + if exMet not in excreta: + continue + fluxes = ( + abs(excreta[exMet] * 0.99 / fluxes_df.loc[exMet, pheno]) + * fluxes_df.loc[:, pheno] + ) + requisite_fluxes[short_code][ + f"{species}|{name_signal[species]}" + ] = fluxes[fluxes != 0] + participated_species.append(species) + # print(requisite_fluxes) + return requisite_fluxes + + @staticmethod + def data_process(dataframes, trial_name_conversion): + short_codes, trials_list = [], [] + values, times = {}, {} # The times must capture upstream + first = True + for df_name, (times_df, values_df) in dataframes.items(): + # print(df_name) + # display(times_df) ; display(values_df) + times_tup = FBAHelper.parse_df(times_df) + average_times = np.mean(times_tup.values, axis=0) + values[df_name], times[df_name] = [], [] + for trial_code in values_df.index: + row_let, col_num = trial_code[0], trial_code[1:] + # print(trial_code, row_let, col_num) + for trial_row_values in trial_contents( + trial_code, values_df.index, values_df.values + ): + if first: + short_code, experimentalID = trial_name_conversion[row_let][ + col_num + ] + trials_list.extend([experimentalID] * len(values_df.columns)) + short_codes.extend([short_code] * len(values_df.columns)) + values[df_name].extend(trial_row_values) + times[df_name].extend(average_times) + first = False + # process the data to the smallest dataset, to accommodate heterogeneous data sizes + minVal = min(list(map(len, values.values()))) + for df_name, data in values.items(): + values[df_name] = data[:minVal] + times2 = times.copy() + for df_name, data in times2.items(): + times[df_name] = data[:minVal] + # construct the growth DataFrame + df_data = { + "trial_IDs": trials_list[:minVal], + "short_codes": short_codes[:minVal], + } + df_data.update( + {"Time (s)": np.mean(list(times.values()), axis=0)} + ) # element-wise average + df_data.update({df_name: vals for df_name, vals in values.items()}) + data_df = DataFrame(df_data) + data_df.index = data_df["short_codes"] + data_df = data_df.drop(["short_codes"], axis=1) + data_df.to_csv("growth_spectra.tsv", sep="\t") + return data_df + + +class BiologData: + + @staticmethod + def process( + data_paths, + trial_conditions_path, + community_members, + col_row_num, + member_conversions, + culture=None, + date=None, + significant_deviation=None, + solver="glpk", + msdb_path: str = None, + ): + row_num = 8 + column_num = 12 + ( + zipped_output, + data_timestep_hr, + simulation_time, + dataframes, + trials, + culture, + date, + fluxes_df, + ) = BiologData.load_data( + data_paths, + significant_deviation, + community_members, + col_row_num, + row_num, + culture, + date, + solver, + ) + experimental_metadata, standardized_carbon_conc, trial_name_conversion = ( + BiologData.metadata( + trial_conditions_path, row_num, column_num, culture, date + ) + ) + biolog_df = BiologData.data_process(dataframes, trial_name_conversion) 
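+         # Biolog trials, in brief: an 8x12 plate whose wells map to short codes
+         # and ModelSEED IDs via trial_name_conversion, e.g. (a sketch)
+         #   trial_name_conversion["A"]["1"]  # -> ("B1", <ModelSEED compound ID>)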
+ requisite_biomass = BiologData.biomass_growth(biolog_df, member_conversions) + return ( + experimental_metadata, + biolog_df, + fluxes_df, + standardized_carbon_conc, + requisite_biomass, + trial_name_conversion, + np.mean(data_timestep_hr), + simulation_time, + ) + + @staticmethod + def load_data( + data_paths, + significant_deviation, + community_members, + col_row_num, + row_num, + culture, + date, + solver, + ): + zipped_output = [data_paths["path"], "fluxes.tsv"] + # determine the metabolic fluxes for each member and phenotype + # import and parse the raw CSV data + # TODO - this may be capable of emulating leveraged functions from the GrowthData object + fluxes_df = phenotypes(community_members, solver=solver) + # fluxes_df = None + data_timestep_hr = [] + dataframes = {} + raw_data = _spreadsheet_extension_load(data_paths["path"]) + significant_deviation = significant_deviation or 2 + # culture = culture or _find_culture(data_paths['path']) + culture = culture or ",".join( + [ + x + for x in data_paths.values() + if (x not in ["OD"] and not re.search(r"\w\.\w", x)) + ] + ) + date = date or _findDate(data_paths["path"]) + for org_sheet, name in data_paths.items(): + if org_sheet == "path": + continue + sheet = org_sheet.replace(" ", "_") + df_name = f"{name}:{sheet}" + if df_name not in dataframes: + dataframes[df_name] = _spreadsheet_extension_parse( + data_paths["path"], raw_data, org_sheet + ) + dataframes[df_name].columns = dataframes[df_name].iloc[col_row_num] + dataframes[df_name].drop( + dataframes[df_name].index[: col_row_num + 1], inplace=True + ) + dataframes[df_name].dropna(inplace=True) + # parse the DataFrame for values + dataframes[df_name].columns = [ + str(x).strip() for x in dataframes[df_name].columns + ] + simulation_time = dataframes[df_name].iloc[0, -1] / hour + # display(dataframes[df_name]) + data_timestep_hr.append( + simulation_time / int(float(dataframes[df_name].columns[-1])) + ) + # define the times and data + data_times_df, data_values_df = _df_construction( + name, + df_name, + None, + None, + significant_deviation, + dataframes[df_name], + row_num, + False, + ) + # display(data_times_df) ; display(data_values_df) + dataframes[df_name] = (data_times_df, data_values_df) + + # differentiate the phenotypes for each species + trials = set( + chain.from_iterable([list(df.index) for df, times in dataframes.values()]) + ) + return ( + zipped_output, + data_timestep_hr, + simulation_time, + dataframes, + trials, + culture, + date, + fluxes_df, + ) + + @staticmethod + def metadata(trial_conditions_path, row_num, column_num, culture, date): + # define the conditions for each trial + with open(trial_conditions_path) as trials: + trial_conditions = json.load(trials) + + # define the metadata DataFrame and a few columns + constructed_experiments = DataFrame() + ex_prefix = "B" + constructed_experiments.index = [ + f"{ex_prefix}{x+1}" for x in list(range(row_num * column_num)) + ] + constructed_experiments.index.name = "short_code" + + # define the strains column + experiment_ids, trial_names = [], [] + trial_name_conversion, trial_mets = {}, {} + count = 1 + ## apply universal values to all trials + for row in range(row_num): + trial_letter = chr(ord("A") + row) + trial_name_conversion[trial_letter] = {} + ## add rows where the initial concentration in the first trial is non-zero + for col in range(1, column_num + 1): + ## construct the columns of information + dataID = trial_letter + str(col) + MSID = trial_conditions[dataID]["ModelSEED_ID"] + short_code = 
ex_prefix + str(count) + + experiment_ids.append(MSID) + trial_names.append(trial_conditions[dataID]["name"]) + trial_name_conversion[trial_letter][str(col)] = (short_code, MSID) + trial_mets[MSID] = {short_code: trial_conditions[dataID]["mM"]} + count += 1 + + # add columns to the exported dataframe + constructed_experiments.insert(0, "ModelSEED_ID", experiment_ids) + constructed_experiments.insert(0, "condition", trial_names) + constructed_experiments["strain"] = [culture] * (column_num * row_num) + constructed_experiments["date"] = [date] * (column_num * row_num) + constructed_experiments.to_csv("growth_metadata.tsv", sep="\t") + return constructed_experiments, trial_mets, trial_name_conversion + + @staticmethod + def data_process(dataframes, trial_name_conversion): + short_codes, trials_list = [], [] + values, times = {}, {} # The times must capture upstream + first = True + for df_name, (times_df, values_df) in dataframes.items(): + # display(df_name, times_df, values_df) + times_tup = FBAHelper.parse_df(times_df) + # display(DataFrame(times_tup.values)) + average_times = list(np.mean(times_tup.values, axis=0)) + # print(average_times) + # print(len(average_times)) + values[df_name], times[df_name] = [], [] + for exprID in values_df.index: + row_let, col_num = exprID[0], exprID[1:] + for trial_row_values in trial_contents( + exprID, values_df.index, values_df.values + ): + if first: + short_code, experimentalID = trial_name_conversion[row_let][ + col_num + ] + trials_list.extend([experimentalID] * len(values_df.columns)) + short_codes.extend([short_code] * len(values_df.columns)) + if len(trial_row_values) != len(average_times): + print( + f"The length of the trial data {len(trial_row_values)} " + f"differs from that of the timesteps {len(average_times)}, " + f"which creates an incompatible DataFrame."
+ ) + values[df_name].extend(trial_row_values) + times[df_name].extend(average_times) + first = False + # process the data to the smallest dataset, to accommodate heterogeneous data sizes + minVal = min(list(map(len, values.values()))) + for df_name, data in values.items(): + values[df_name] = data[:minVal] + times2 = times.copy() + for df_name, data in times2.items(): + times[df_name] = data[:minVal] + df_data = {"trial_IDs": trials_list[:minVal], "short_codes": short_codes[:minVal]} + df_data.update( + {"Time (s)": list(np.mean(list(times.values()), axis=0))} + ) # element-wise average + df_data.update({df_name: vals for df_name, vals in values.items()}) + biolog_df = DataFrame(df_data) + biolog_df.index = biolog_df["short_codes"] + del biolog_df["short_codes"] + biolog_df.to_csv("growth_spectra.tsv", sep="\t") + + return biolog_df + + @staticmethod + def biomass_growth(biolog_df, member_conversions): + requisite_biomass = {} + for short_code in biolog_df.index.unique(): + requisite_biomass[short_code] = {} + for signal, conversion in member_conversions.items(): + short_code_df = biolog_df[biolog_df.index == short_code] + requisite_biomass[short_code][signal] = ( + conversion + * short_code_df[signal.replace("|", ":").replace(" ", "_")].iloc[-1] + ) + return requisite_biomass diff --git a/modelseedpy/community/get_ncbi_gbff.pl b/modelseedpy/community/get_ncbi_gbff.pl new file mode 100644 index 00000000..cbeddcfc --- /dev/null +++ b/modelseedpy/community/get_ncbi_gbff.pl @@ -0,0 +1,13 @@ +use strict; + +while (<>){ + chomp ($_); + next if ($_=~/^\s*$/); + my $val = `grep $_ assembly_summary_refseq.txt |cut -f 20`; + chomp ($val); + my @p = split ("/", $val); + my $n = $p[-1]; + my $url = "${val}/${n}_genomic.gbff.gz"; + my $fpath = "${n}_genomic.gbff.gz"; + print "curl $url -o $fpath" . "\n"; +} diff --git a/modelseedpy/community/metquest_code.py b/modelseedpy/community/metquest_code.py new file mode 100644 index 00000000..d6ad31e0 --- /dev/null +++ b/modelseedpy/community/metquest_code.py @@ -0,0 +1,1162 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +from collections import deque, defaultdict +import os +import glob +import sys +import warnings +from itertools import combinations +import re +import pandas as pd +import numpy as np +import cobra +import networkx as nx + +from modelseedpy.community import commhelper +from modelseedpy import MSModelUtil + +warnings.filterwarnings("ignore") + + +def _create_graph_with_internal_reaction(organismsdata): + """ + This function creates a NetworkX DiGraph object which consists of the + reactions and metabolites occurring inside the organisms in a community. + This makes use of the reaction information, i.e., irreversible and + reversible, which is obtained from another script, fetch_reactions.
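+ Each irreversible reaction is represented by a single reaction node, + whereas each reversible reaction is split into separate forward and + backward reaction nodes, so that both directions can be traversed.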
+ + Parameters + ---------- + organismsdata : dict + Dictionary containing the reaction information about organisms + + Returns + ------- + G : NetworkX DiGraph Object + Bipartite graph consisting of internal reactions in organisms + """ + G = nx.DiGraph() + for modelname in organismsdata: + G.add_nodes_from(organismsdata[modelname]["irreversible_rxn_no"], bipartite=1) + G.add_nodes_from(organismsdata[modelname]["reversible_rxn_no"], bipartite=1) + G.add_nodes_from( + organismsdata[modelname]["reversible_back_rxn_no"], bipartite=1 + ) + irrev_lhs_nodes = list( + set( + [ + item + for sublist in organismsdata[modelname]["irreversible_lhs_nodes"] + for item in sublist + ] + ) + ) + irrev_rhs_nodes = list( + set( + [ + item + for sublist in organismsdata[modelname]["irreversible_rhs_nodes"] + for item in sublist + ] + ) + ) + rev_lhs_nodes = list( + set( + [ + item + for sublist in organismsdata[modelname]["reversible_lhs_nodes"] + for item in sublist + ] + ) + ) + rev_rhs_nodes = list( + set( + [ + item + for sublist in organismsdata[modelname]["reversible_rhs_nodes"] + for item in sublist + ] + ) + ) + G.add_nodes_from(irrev_lhs_nodes, bipartite=0) + G.add_nodes_from(irrev_rhs_nodes, bipartite=0) + G.add_nodes_from(rev_lhs_nodes, bipartite=0) + G.add_nodes_from(rev_rhs_nodes, bipartite=0) + for irrevidx in range(len(organismsdata[modelname]["irreversible_rxn_no"])): + for lhsmetidx in range( + len(organismsdata[modelname]["irreversible_lhs_nodes"][irrevidx]) + ): + G.add_edges_from( + [ + ( + organismsdata[modelname]["irreversible_lhs_nodes"][ + irrevidx + ][lhsmetidx], + organismsdata[modelname]["irreversible_rxn_no"][irrevidx], + ) + ] + ) + for rhsmetidx in range( + len(organismsdata[modelname]["irreversible_rhs_nodes"][irrevidx]) + ): + G.add_edges_from( + [ + ( + organismsdata[modelname]["irreversible_rxn_no"][irrevidx], + organismsdata[modelname]["irreversible_rhs_nodes"][ + irrevidx + ][rhsmetidx], + ) + ] + ) + for revidx in range(len(organismsdata[modelname]["reversible_rxn_no"])): + for lhsmetidxrev in range( + len(organismsdata[modelname]["reversible_lhs_nodes"][revidx]) + ): + G.add_edges_from( + [ + ( + organismsdata[modelname]["reversible_lhs_nodes"][revidx][ + lhsmetidxrev + ], + organismsdata[modelname]["reversible_rxn_no"][revidx], + ) + ] + ) + G.add_edges_from( + [ + ( + organismsdata[modelname]["reversible_back_rxn_no"][revidx], + organismsdata[modelname]["reversible_lhs_nodes"][revidx][ + lhsmetidxrev + ], + ) + ] + ) + for rhsmetidxrev in range( + len(organismsdata[modelname]["reversible_rhs_nodes"][revidx]) + ): + G.add_edges_from( + [ + ( + organismsdata[modelname]["reversible_rxn_no"][revidx], + organismsdata[modelname]["reversible_rhs_nodes"][revidx][ + rhsmetidxrev + ], + ) + ] + ) + G.add_edges_from( + [ + ( + organismsdata[modelname]["reversible_rhs_nodes"][revidx][ + rhsmetidxrev + ], + organismsdata[modelname]["reversible_back_rxn_no"][revidx], + ) + ] + ) + return G + + +def _create_graph_with_exchange_reactions(G, orgs, namemap): + """ + This function first identifies the common exchange metabolites + and the non-common exchange metabolites and adds them to the + DiGraph object generated above. + + Parameters + ---------- + G : NetworkX DiGraph Object + Bipartite graph of reaction network from organisms + orgs : dict + Dictionary consisting of irreversible, reversible and exchange + reactions pertaining to the organisms. If more than one organism + is used, this dictionary consists of information about all the + organisms. 
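+ The keys are model identifiers and the values are the per-organism + node and reaction-name lists produced by segregate_reactions_from_models.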
namemap : dict + Dictionary mapping the adhoc reaction names to reaction names in + the model + + Returns + ------- + G : NetworkX DiGraph Object + Bipartite graph consisting of internal and exchange reactions in organisms + namemap : dict + Dictionary mapping the adhoc exchange reaction names to reaction names in + the model + """ + metabolite_exchanged = [] + for orgnames in orgs: + exc_met = orgs[orgnames]["exchange_metab_nodes"] + metabolite_exchanged.append(exc_met) + # Common exchange metabolites in different organisms + common_exchange_metabolite = list( + set.intersection(*list(map(set, metabolite_exchanged))) + ) + common_exchange_metabolite.sort() + # Adding the common exchange metabolites to the graph + for orgnames in orgs: + renamed_exc_met = [ + f"{orgnames} {comexcmet}" for comexcmet in common_exchange_metabolite + ] + number_exc_met = list(range(0, len(common_exchange_metabolite))) + mod_exc_rxn_number = [ + f"Org_{orgnames} ER{str(num + 1)}" for num in number_exc_met + ] + mod_exc_rev_rxn_number = [ + f"Org_{orgnames} ERR{str(num + 1)}" for num in number_exc_met + ] + G.add_nodes_from(mod_exc_rxn_number, bipartite=1) + G.add_nodes_from(mod_exc_rev_rxn_number, bipartite=1) + G.add_nodes_from(common_exchange_metabolite, bipartite=0) + G.add_nodes_from(renamed_exc_met, bipartite=0) + for k in range(len(renamed_exc_met)): + namemap[mod_exc_rxn_number[k]] = common_exchange_metabolite[k] + namemap[mod_exc_rev_rxn_number[k]] = common_exchange_metabolite[k] + G.add_edges_from([(renamed_exc_met[k], mod_exc_rxn_number[k])]) + G.add_edges_from([(mod_exc_rxn_number[k], common_exchange_metabolite[k])]) + G.add_edges_from( + [(common_exchange_metabolite[k], mod_exc_rev_rxn_number[k])] + ) + G.add_edges_from([(mod_exc_rev_rxn_number[k], renamed_exc_met[k])]) + # Adding the uncommon exchange metabolites to the graph + for orgnames in orgs: + metitems = orgs[orgnames]["exchange_metab_nodes"] + non_common_exc_met = list(set(metitems) - set(common_exchange_metabolite)) + non_common_exc_met.sort() + renamed_non_common_exc_met = [f"{orgnames} {s}" for s in non_common_exc_met] + number_non_common_exc_met = list(range(0, len(non_common_exc_met))) + mod_non_common_exc_rxn_number = [ + f"Org_{orgnames} NCER{str(num + 1)}" for num in number_non_common_exc_met + ] + mod_non_common_exc_rev_rxn_number = [ + f"Org_{orgnames} NCERR{str(num + 1)}" for num in number_non_common_exc_met + ] + G.add_nodes_from(mod_non_common_exc_rxn_number, bipartite=1) + G.add_nodes_from(mod_non_common_exc_rev_rxn_number, bipartite=1) + G.add_nodes_from(non_common_exc_met, bipartite=0) + G.add_nodes_from(renamed_non_common_exc_met, bipartite=0) + for k in range(len(renamed_non_common_exc_met)): + namemap[mod_non_common_exc_rxn_number[k]] = non_common_exc_met[k] + namemap[mod_non_common_exc_rev_rxn_number[k]] = non_common_exc_met[k] + G.add_edges_from( + [(renamed_non_common_exc_met[k], mod_non_common_exc_rxn_number[k])] + ) + G.add_edges_from( + [(mod_non_common_exc_rxn_number[k], non_common_exc_met[k])] + ) + G.add_edges_from( + [(non_common_exc_met[k], mod_non_common_exc_rev_rxn_number[k])] + ) + G.add_edges_from( + [(mod_non_common_exc_rev_rxn_number[k], renamed_non_common_exc_met[k])] + ) + return G, namemap + + +def create_graph(file_names, no_of_orgs): + """ + This function creates bipartite graphs of the organisms based on the + path provided and the number of organisms. For instance, if a folder + has 3 model files, and the number of organisms is 2, 3 (3C2) different + bipartite graphs are created.
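For example, for organisms A, B and C with no_of_orgs = 2, graphs are + built for the pairs (A, B), (A, C) and (B, C).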
The graph objects and the namemap dictionary + can optionally be saved as gpickle and pickle files respectively + (see the commented-out block below). + + Parameters + ---------- + file_names : list + List containing the file names of models + no_of_orgs : int + Number of organisms to be used for creating the DiGraph. + + Returns + ------- + H : list + List of bipartite NetworkX DiGraph objects, each consisting of the + internal and exchange reactions of one combination of organisms + full_name_map : dict + Dictionary mapping the adhoc reaction names to reaction names in + the model + """ + + H = [] + organisms_reaction_data, partial_name_map = segregate_reactions_from_models( + file_names + ) + if organisms_reaction_data: + organisms_names = list(organisms_reaction_data.keys()) + all_possible_combis = list( + combinations(list(range(len(organisms_names))), int(no_of_orgs)) + ) + if int(no_of_orgs) > 1 and sorted(organisms_names)[0][0] == "0": + all_possible_combis = all_possible_combis[: len(organisms_names) - 1] + if all_possible_combis: + for ncom in range(len(all_possible_combis)): + file_name = "" + current_combination = {} + for numincom in range(len(all_possible_combis[ncom])): + current_combination[ + organisms_names[all_possible_combis[ncom][numincom]] + ] = organisms_reaction_data[ + organisms_names[all_possible_combis[ncom][numincom]] + ] + file_name = ( + file_name + + organisms_names[all_possible_combis[ncom][numincom]] + + "_" + ) + H.append(_create_graph_with_internal_reaction(current_combination)) + temp, full_name_map = _create_graph_with_exchange_reactions( + H[ncom], current_combination, partial_name_map + ) + H[ncom] = temp + print(len(H), H[ncom]) + print("Number of edges in graph", len(H[ncom].edges())) + print("Number of nodes in graph", len(H[ncom].nodes())) + + # Uncomment the following code to save the graph files externally in your machine + # Note: Graph files can occupy a large space for large datasets + """ + if os.access(path_name_with_models, os.W_OK): + with open(file_name + 'namemap' + '.pickle', 'wb') as filetodump: + dump(full_name_map, filetodump) + nx.write_gpickle(H[ncom], file_name + '.gpickle') + print('Graph and namemap saved for file(s) in', path_name_with_models) + """ + else: + print( + "The number of organisms requested for a consortium graph exceeds the number of models given" + ) + print("Program will now exit") + sys.exit() + else: + print("Cannot create graph") + sys.exit() + return H, full_name_map + + +def forward_pass(graph_object, media): + """ + This function carries out the Guided Breadth First Search on a directed + bipartite graph starting from the entries in the seed metabolite set. + + Parameters + ---------- + graph_object : NetworkX DiGraph Object + Bipartite graph of the metabolic network + + media : dict + Dictionary of seed metabolites (including the source) from which + the traversal begins + + Returns + ------- + lower_bound_metabolite : defaultdict + Minimum number of steps required to reach a metabolite + status_dict : defaultdict + Dictionary pertaining to the status of every reaction - whether it + has been visited or not + scope : set + Set of metabolites that can be produced from the given set of + seed metabolites + + Notes + ----- + Starting with the set of seed metabolites S, the algorithm first finds + all the reactions from the set R, whose precursor metabolites are in S. + Such reactions are marked visited and added to the visited reaction set. + Metabolites produced by these reactions are checked. The reactions where + these metabolites participate are then checked for the presence of all + their predecessors and are added to the queue. This traversal continues in a
This traversal continues in a + breadth-first manner and stops when there are no further reactions to + be visited. + """ + pred = graph_object.predecessors + succ = graph_object.successors + lower_bound_metabolite = {cpd: [0] for cpd in media} + lower_bound_reaction = defaultdict(list) + status_dict = defaultdict(str) + # Using a deque since deques have O(1) speed for appendleft() and popleft() + # while lists have O(n) performance for inserting and popping. + queue = deque([]) + # All seed metabolites are always present, hence require 0 steps + stage = 1 + mediaMets = list(media.keys()) + scope = list(media.keys()) + starting_rxn_node = [] + # First stage where starting_rxn_node list contains all the reactions + # which require only the seed metabolites as input + for starting_met_nodes in mediaMets: + # Essential when analysing mutiple networks with same seed metabolite + # set, although would be redundant in case of single network + if starting_met_nodes in graph_object: + for startingrxns in succ(starting_met_nodes): + if set(pred(startingrxns)).issubset(mediaMets): + if startingrxns not in starting_rxn_node: + starting_rxn_node.append(startingrxns) + for metsprod in succ(startingrxns): + scope.add(metsprod) + if stage not in lower_bound_metabolite[metsprod]: + lower_bound_metabolite[metsprod].append(stage) + if stage not in lower_bound_reaction[startingrxns]: + lower_bound_reaction[startingrxns].append(stage) + for rxn in starting_rxn_node: + for metabs in succ(rxn): + for nextrxn in succ(metabs): + if set(pred(nextrxn)).issubset(scope): + if nextrxn not in queue: + queue.append(nextrxn) + status_dict[rxn] = "V" + while queue: + stage += 1 + for parentrxn in list(queue): + if status_dict[parentrxn] == "": + if stage not in lower_bound_reaction[parentrxn]: + lower_bound_reaction[parentrxn].append(stage) + for mets in succ(parentrxn): + scope.add(mets) + if stage not in lower_bound_metabolite[mets]: + lower_bound_metabolite[mets].append(stage) + for progeny in succ(mets): + if set(pred(progeny)).issubset(scope): + if status_dict[progeny] != "V": + if progeny not in queue: + queue.append(progeny) + status_dict[parentrxn] = "V" + elif status_dict[parentrxn] == "V": + for mets in succ(parentrxn): + if stage not in lower_bound_metabolite[mets]: + lower_bound_metabolite[mets].append(stage) + queue.popleft() + return lower_bound_metabolite, status_dict, scope + + +def find_different_reaction_types(stoi_matrix, model, current_model_name): + """ + This function finds the exchange, irreversible and the reversible reactions + from the model. 
+ + Parameters + ---------- + stoi_matrix : numpy array + Transposed stoichiometric matrix of the model (reactions x metabolites) + model : COBRA model object + COBRA model object created from SBML models + current_model_name : str + Name which is to be prefixed against every + reaction/metabolite (to differentiate the entries in multiple organisms, + when a community model is built) + Returns + ------- + exchange_met_ids : list + Metabolite identifiers of exchange metabolites + irrev_lhs_nodes : list + Metabolite identifiers of reactants of irreversible reactions + irrev_rhs_nodes : list + Metabolite identifiers of products of irreversible reactions + rev_lhs_nodes : list + Metabolite identifiers of reactants of reversible reactions + rev_rhs_nodes : list + Metabolite identifiers of products of reversible reactions + exchange_rxn_ids : list + Reaction identifiers of exchange reactions + irrev_rxn_ids : list + Reaction identifiers of irreversible reactions + rev_rxn_ids : list + Reaction identifiers of reversible reactions + + """ + + xdim = np.shape(stoi_matrix) + reactants_of_reaction, total_metabolites_in_reaction, products_of_reaction = ( + [], + [], + [], + ) + number_of_reactants_in_reaction, total_number_of_metabs_in_reaction = [], [] + number_of_products_in_reaction, exchange_reaction_idx = [], [] + reaction_identifiers, reaction_in_model, metabolite_identifiers = [], [], [] + for metab in model.metabolites: + metabolite_identifiers.append(metab.id) + for rxns in model.reactions: + reaction_identifiers.append(rxns.id) + reaction_in_model.append(rxns.reaction) + for rxnidx in range(xdim[0]): + reactants_of_reaction.append(np.where(stoi_matrix[rxnidx] == -1)) + total_metabolites_in_reaction.append(np.where(stoi_matrix[rxnidx] != 0)) + products_of_reaction.append(np.where(stoi_matrix[rxnidx] == 1)) + number_of_reactants_in_reaction.append(len(reactants_of_reaction[rxnidx][0])) + total_number_of_metabs_in_reaction.append( + len(total_metabolites_in_reaction[rxnidx][0]) + ) + number_of_products_in_reaction.append(len(products_of_reaction[rxnidx][0])) + + # Case 1 - Presence of bulk metabolites in the medium + + if ( + reaction_in_model[rxnidx][-1] == "b" + ): # Assuming the bulk metabolites end in 'b' + if ( + number_of_reactants_in_reaction[rxnidx] == 1 + and number_of_products_in_reaction[rxnidx] == 1 + ): + exchange_reaction_idx.append(rxnidx) + # Case 2 - Presence of exchange metabolites + elif ( + number_of_reactants_in_reaction[rxnidx] == 1 + and total_number_of_metabs_in_reaction[rxnidx] == 1 + ): + exchange_reaction_idx.append(rxnidx) + elif ( + number_of_products_in_reaction[rxnidx] == 1 + and total_number_of_metabs_in_reaction[rxnidx] == 1 + ): + exchange_reaction_idx.append(rxnidx) + exchange_met_ids, exchange_met_index, exchange_rxn_ids = [], [], [] + for excentry in exchange_reaction_idx: + exchange_rxn_ids.append(reaction_identifiers[excentry]) + if reaction_in_model[excentry][-1] == "b": + exchange_met_ids.append( + metabolite_identifiers[np.nonzero(stoi_matrix[excentry])[0][0]] + ) + else: + exchange_met_index.append(np.nonzero(stoi_matrix[excentry])[0].tolist()[0]) + if exchange_met_index: + for metind in exchange_met_index: + exchange_met_ids.append(metabolite_identifiers[metind]) + all_rxn_idx = list(range(len(reaction_in_model))) + internal_rxns = list(set(all_rxn_idx) ^ set(exchange_reaction_idx)) + reversible_rxns, irreversible_rxns, rxns_lowerbound, rxns_upperbound = ( + [], + [], + [], + [], + ) + for rxns in model.reactions: + rxns_lowerbound.append(rxns.lower_bound) +
rxns_upperbound.append(rxns.upper_bound) + for idxint in internal_rxns: + if rxns_lowerbound[idxint] < 0 and rxns_upperbound[idxint] >= 0: + reversible_rxns.append(idxint) + elif rxns_lowerbound[idxint] >= 0 and rxns_upperbound[idxint] >= 0: + irreversible_rxns.append(idxint) + # Irreversible reaction nodes + ( + irrev_lhs_temporary, + irrev_rhs_temporary, + irrev_lhs_nodes, + irrev_rhs_nodes, + irrev_rxn_ids, + ) = ([], [], [], [], []) + for irridx in irreversible_rxns: + irrev_rxn_ids.append(reaction_identifiers[irridx]) + irrev_lhs_temporary.append(np.where(stoi_matrix[irridx] < 0)[0].tolist()) + irrev_rhs_temporary.append(np.where(stoi_matrix[irridx] > 0)[0].tolist()) + for lhsirridx in range(len(irrev_lhs_temporary)): + temp_metab_list_lhs = [] + for met_idx_lhs in irrev_lhs_temporary[lhsirridx]: + met_namech_lhs = ( + f"{current_model_name} {metabolite_identifiers[met_idx_lhs]}" + ) + temp_metab_list_lhs.append(met_namech_lhs) + irrev_lhs_nodes.append(temp_metab_list_lhs) + for rhsirridx in range(len(irrev_rhs_temporary)): + temp_metab_list_rhs = [] + for met_idx_rhs in irrev_rhs_temporary[rhsirridx]: + met_namech_rhs = ( + f"{current_model_name} {metabolite_identifiers[met_idx_rhs]}" + ) + temp_metab_list_rhs.append(met_namech_rhs) + irrev_rhs_nodes.append(temp_metab_list_rhs) + + # Reversible reaction nodes + rev_lhs_temporary, rev_rhs_temporary, rev_lhs_nodes, rev_rhs_nodes, rev_rxn_ids = ( + [], + [], + [], + [], + [], + ) + for rridx in reversible_rxns: + rev_rxn_ids.append(reaction_identifiers[rridx]) + rev_lhs_temporary.append(np.where(stoi_matrix[rridx] < 0)[0].tolist()) + rev_rhs_temporary.append(np.where(stoi_matrix[rridx] > 0)[0].tolist()) + for lhsrevidx in range(len(rev_lhs_temporary)): + temp_metab_list_lhs_rev = [] + for met_idx_lhs in rev_lhs_temporary[lhsrevidx]: + met_namech_lhs = ( + f"{current_model_name} {metabolite_identifiers[met_idx_lhs]}" + ) + temp_metab_list_lhs_rev.append(met_namech_lhs) + rev_lhs_nodes.append(temp_metab_list_lhs_rev) + for rhsrevidx in range(len(rev_rhs_temporary)): + temp_metab_list_rhs_rev = [] + for met_idx_rhs in rev_rhs_temporary[rhsrevidx]: + met_namech_rhs = ( + f"{current_model_name} {metabolite_identifiers[met_idx_rhs]}" + ) + temp_metab_list_rhs_rev.append(met_namech_rhs) + rev_rhs_nodes.append(temp_metab_list_rhs_rev) + return ( + exchange_met_ids, + irrev_lhs_nodes, + irrev_rhs_nodes, + rev_lhs_nodes, + rev_rhs_nodes, + exchange_rxn_ids, + irrev_rxn_ids, + rev_rxn_ids, + ) + + +def segregate_reactions_from_models(models): + """ + This function gets the data pertaining to the reactions and the + metabolites from the models of multiple organisms. + It takes as input a list of COBRA model objects (typically loaded from + '.xml' SBML files) and, for each model, derives the stoichiometric + matrix and classifies the reactions.
+ + Parameters + ---------- + models : list + List of model objects + + Returns + ------- + all_organisms_info : dict + Dictionary of all model data (reaction information about all the + organisms) + namemap : dict + Dictionary mapping the adhoc reaction names to reaction names in + the model + + """ + all_organisms_info = {} + namemap = {} + for model in models: + stoi = cobra.util.array.create_stoichiometric_matrix(model) + current_organisms_info = {} + rxns_in_model, mets_in_model = [], [] + for metab in model.metabolites: + mets_in_model.append(metab.id) + for reac in model.reactions: + rxns_in_model.append(reac.id) + stoi_matrix = stoi.T + ( + exchange_nodes, + irrev_lhs_nodes, + irrev_rhs_nodes, + rev_lhs_nodes, + rev_rhs_nodes, + exc_name, + irrev_rxn_name, + rev_rxn_name, + ) = find_different_reaction_types(stoi_matrix, model, model.id) + current_organisms_info[model.id] = { + "exchange_metab_nodes": exchange_nodes, + "irreversible_lhs_nodes": irrev_lhs_nodes, + "irreversible_rhs_nodes": irrev_rhs_nodes, + "reversible_lhs_nodes": rev_lhs_nodes, + "reversible_rhs_nodes": rev_rhs_nodes, + "exch_rxn_name": exc_name, + "irrev_rxn_name": irrev_rxn_name, + "rev_rxn_name": rev_rxn_name, + } + + irrev_rxn_number = [] + for num in range(len(irrev_lhs_nodes)): + modified_name_irrev = f"Org_{model.id} IR" + str(num + 1) + irrev_rxn_number.append(modified_name_irrev) + namemap[modified_name_irrev] = irrev_rxn_name[num] + + rev_rxn_number = [] + for num in range(len(rev_lhs_nodes)): + modified_name_rev = f"Org_{model.id} RR" + str(num + 1) + rev_rxn_number.append(modified_name_rev) + namemap[modified_name_rev] = rev_rxn_name[num] + + rev_back_rxn_number = [] + for num in range(len(rev_lhs_nodes)): + modified_name_back_rev = f"Org_{model.id} RevBR" + str(num + 1) + rev_back_rxn_number.append(modified_name_back_rev) + namemap[modified_name_back_rev] = rev_rxn_name[num] + + current_organisms_info[model.id]["reversible_rxn_no"] = rev_rxn_number + current_organisms_info[model.id]["irreversible_rxn_no"] = irrev_rxn_number + current_organisms_info[model.id]["total_nodes"] = ( + len(exchange_nodes) + len(irrev_lhs_nodes) + len(rev_lhs_nodes) + ) + current_organisms_info[model.id]["model_rxns"] = rxns_in_model + current_organisms_info[model.id]["reversible_back_rxn_no"] = rev_back_rxn_number + current_organisms_info[model.id]["metabolites"] = mets_in_model + all_organisms_info.update(current_organisms_info) + return all_organisms_info, namemap + + +def find_relievedrxns(model, org_info, org_info_pert): + relieved = { + i: list(set(org_info_pert[i]) - set(org_info[i])) for i in org_info_pert + } + detailed_rel_rxns, rel_rxns_name = {}, {} + + for i in model: + j = i.id + detailed_rel_rxns[j] = [] + rel_rxns_name[j] = [] + if len(relieved[j]): + rxn_ids = [] + for r in i.reactions: + rxn_ids.append(r.id) + for rel in relieved[j]: + rel_rxn = i.reactions[rxn_ids.index(rel)].reaction + detailed_rel_rxns[j].append(rel_rxn) + rel_rxns_name[j].append(i.reactions[rxn_ids.index(rel)].name) + + return relieved, detailed_rel_rxns, rel_rxns_name + + +def find_stuckrxns(model, community, media, no_of_orgs): + # Constructing graphs + warnings.filterwarnings("ignore") + G, full_name_map = create_graph(community, no_of_orgs) + if not os.path.exists("results"): + os.makedirs("results") + all_possible_combis = list( + combinations(list(range(len(community))), int(no_of_orgs)) + ) + if no_of_orgs > 1 and sorted(community)[0][0] == "0": + all_possible_combis = all_possible_combis[: len(community) - 1] + org_info = {} + 
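# org_info maps each organism id to its stuck reactions, and scope maps it + # to the set of metabolites that are producible from the media (per forward_pass) +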
scope = {} + print("No. of graphs constructed: ", len(G)) + + # This loop finds all the stuck reactions + for i in range(len(all_possible_combis)): + lbm, sd, s = forward_pass(G[i], media) + for j in range(len(all_possible_combis[i])): + stuck, rxnNode = [], [] + model1 = model[all_possible_combis[i][j]].id + visited = list(sd.keys()) + for r in G[i].nodes: + if r.find(model1) >= 0: + rxnNode.append(r) + for rxn in rxnNode: + if rxn in visited: + continue + elif rxn.find("ERR") >= 0: + continue + elif rxn.find("Org") >= 0: + if (rxn[len(model1) + 5] == "I") or (rxn[len(model1) + 5] == "R"): + stuck.append(rxn) + org_info[model1] = stuck + scope[model1] = s + return org_info, scope, full_name_map + + +def decrypt_orginfo(org_info, namemap): + """ + This function decrypts the rxn ids using the data in the corresponding namemap + :param org_info: + :param namemap: + :return: + org_info: A dictionary of decrypted rxn ids for each community + """ + for i in org_info: + for j in range(len(org_info[i])): + org_info[i][j] = namemap[org_info[i][j]] + return org_info + + +def make_perturbed_community(rem_org, pert_models, pert_community): + pert_model_ids = [i.id for i in pert_models] + for i in rem_org: + if i in pert_model_ids: + pert_models.remove(pert_models[pert_model_ids.index(i)]) + pert_community.remove(pert_community[pert_model_ids.index(i)]) + pert_model_ids.remove(i) + + return pert_models, pert_community, pert_model_ids + + +def perform_task( + media, model, transport_rxns, pert_community, org_info_wo_trans_rxn, rem_org_list, n +): + org_info_pert, scope_pert, namemap_pert = find_stuckrxns( + model, pert_community, media, len(pert_community) + ) + org_info_pert = decrypt_orginfo(org_info_pert, namemap_pert) + org_info_pert_wo_trans_rxn = { + i: list(set(org_info_pert[i]) - set(transport_rxns)) for i in org_info_pert + } + + with open(f"results/Community_without_clus{str(n)}.csv", "w") as g: + for m in org_info_pert_wo_trans_rxn: + g.write(m + "," + str(len(org_info_pert_wo_trans_rxn[m])) + "\n") + stuck_com = stuck_pert_com = 0 + for i in org_info_wo_trans_rxn: + if i not in rem_org_list: + stuck_com += len(org_info_wo_trans_rxn[i]) + for i in org_info_pert_wo_trans_rxn: + stuck_pert_com += len(org_info_pert_wo_trans_rxn[i]) + # guard against division by zero when the perturbed community has no stuck reactions + msi = 1 - (stuck_com / stuck_pert_com) if stuck_pert_com else 0 + print(f"{n}th cluster") + return org_info_pert, org_info_pert_wo_trans_rxn, msi + + +def write_relieved_rxns(g, relieved, detailed_rel_rxns, rel_rxns_name): + g.write("acceptor\trelieved reactions\n") + for i in relieved: + g.write(i + "\t") + for j in list(set(relieved[i])): + g.write(j + "\t\n\t") + for d in list(set(rel_rxns_name[i])): + g.write(d + "\t\n\t") + for k in list(set(detailed_rel_rxns[i])): + g.write(k + "\t\n") + + +def write_relieved_rxn_metadata(h, org_info_wo_trans_rxn, org_info_pert_wo_trans_rxn): + nrelieved = {} + for i in org_info_pert_wo_trans_rxn: + nrelieved[i] = len(org_info_pert_wo_trans_rxn[i]) - len( + org_info_wo_trans_rxn[i] + ) + if nrelieved[i]: + h.write( + i + + "," + + str(len(org_info_wo_trans_rxn[i])) + + "," + + str(len(org_info_pert_wo_trans_rxn[i])) + + "," + + str(nrelieved[i]) + + "\n" + ) + + +def find_relieved_rxn(model, media_name, org_info_single, org_info_pair): + """ + This function extracts and writes the relieved rxns into a tsv file + :param model: + :param media_name: name of the media used (identifier to know what media is used when analysis is done using multiple media) + :param org_info_single: Dictionary containing stuck reactions of all microbes in the community + :param 
org_info_pair: Dictionary containing stuck reactions of every pair of microbes in the community + :return: None + """ + relieved = {} + for org1 in model: + for org2 in model: + if org1.id + "_" + org2.id in org_info_pair.keys(): + relieved[org1.id + "_" + org2.id] = [] + temp = list( + set(org_info_single[org1.id + "_" + org1.id]) + - set(org_info_pair[org1.id + "_" + org2.id]) + ) + for j in temp: + relieved[org1.id + "_" + org2.id].append(j) + else: + continue + + rel_rxns_name, detailed_rel_rxns = {}, {} + for i in model: + rxn_ids = [r.id for r in i.reactions] + for j in model: + org1 = i.id + org2 = j.id + if org1 + "_" + org2 in relieved.keys(): + detailed_rel_rxns[org1 + "_" + org2] = [] + rel_rxns_name[org1 + "_" + org2] = [] + for rel in relieved[org1 + "_" + org2]: + rel_rxn = i.reactions[rxn_ids.index(rel)].reaction + detailed_rel_rxns[org1 + "_" + org2].append(rel_rxn) + rel_rxns_name[org1 + "_" + org2].append( + i.reactions[rxn_ids.index(rel)].name + ) + + relieved_rxn_output_file = f"results/relieved_rxns_{media_name}_w_excrxns.tsv" + with open(relieved_rxn_output_file, "w") as g: + header = "acceptor\tdonor\trelieved reactions\n" + g.write(header) + for i in model: + for j in model: + org1 = i.id + org2 = j.id + if org1 + "_" + org2 in relieved.keys(): + g.write(org1 + "\t" + org2 + "\t") + rel_rxns = list(set(relieved[org1 + "_" + org2])) + det_rel_rxns = list(set(detailed_rel_rxns[org1 + "_" + org2])) + rel_rxn_nam = list(set(rel_rxns_name[org1 + "_" + org2])) + for x in rel_rxns: + g.write(x + "\t\n\t\t") + for d in rel_rxn_nam: + g.write(d + "\t\n\t\t") + for k in det_rel_rxns: + g.write(k + "\t\n") + print("relieved reactions are written at:\n", relieved_rxn_output_file) + + +def find_stuck_rxns(models, community, media, comm_size): + """ + Constructs graphs using MetQuest and finds all stuck reactions in the cellular compartment + :param models: list of GEM model objects + :param community: list of model objects from which the graphs are built + :param media: dictionary of seed metabolites + :param comm_size: number of organisms in a community + :return: + org_info: Dictionary containing stuck reactions of all microbes in the community + scope: Dictionary containing all the metabolites that can be produced by the microbes in the community + namemap: Dictionary mapping the adhoc reaction ids to the model reaction ids + """ + warnings.filterwarnings("ignore") + G, full_name_map = create_graph(community, comm_size) + if not os.path.exists("results"): + os.makedirs("results") + + # enumerate combinations of model indices, so the collection can be sized and indexed below + all_possible_combis = list(combinations(list(range(len(models))), comm_size)) + org_info, scope, vis = {}, {}, {} + print("No. 
of graphs constructed: ", len(G)) + + # This loop finds all the stuck reactions + for i in range(len(all_possible_combis)): + lbm, sd, s = forward_pass(G[i], media) + for j in range(len(all_possible_combis[i])): + stuck, rxnNode = [], [] + model1 = models[all_possible_combis[i][j]].id + visited = list(sd.keys()) + for r in G[i].nodes: + if r.find(model1) >= 0: + rxnNode.append(r) + for rxn in rxnNode: + if rxn in visited or rxn.find("ERR") >= 0: + continue + elif rxn.find("Org") >= 0: + if (rxn[len(model1) + 5] == "I") or (rxn[len(model1) + 5] == "R"): + stuck.append(rxn) + model2 = models[all_possible_combis[i][j - 1]].id + org_info[model1 + "_" + model2] = stuck + scope[model1 + "_" + model2] = s + vis[model1 + "_" + model2] = visited + return org_info, scope, full_name_map, vis + + +def decrypt_org_info(org_info, namemap): + """ + This function decrypts the rxn ids using the data in the corresponding namemap + :param org_info: + :param namemap: + :return: + org_info: A dictionary of decrypted rxn ids for each community + """ + for i in org_info: + for j in range(len(org_info[i])): + org_info[i][j] = namemap[org_info[i][j]] + return org_info + + +def pMSI(models, media): + """ + Calculates MSI for CarveMe models + Extracts and writes relieved reactions in every pair + :param models: list of GSMM model objects + :param media: dictionary of seed metabolites + :return: msi: Dictionary containing MSI values for every pair + """ + # build the community model, from which the transport reactions are identified + community_model = commhelper.build_from_species_models(models) + comm_util = MSModelUtil(community_model) + # find stuck reactions in each single organism and in each pair + org_info_single, scope_sin, namemap_sin, vis = find_stuck_rxns( + models, models, media, 1 + ) + org_info_pair, scope_pair, namemap_pair, vis = find_stuck_rxns( + models, models, media, 2 + ) + # decrypt the stuck reactions + org_info_single = decrypt_org_info(org_info_single, namemap_sin) + org_info_pair = decrypt_org_info(org_info_pair, namemap_pair) + # Filter out the transport reactions from every stuck reaction list + org_info_single_wo_trans_rxn, org_info_pair_wo_trans_rxn = {}, {} + for i in org_info_single: + org_info_single_wo_trans_rxn[i] = list( + set(org_info_single[i]) - set(comm_util.transport_list()) + ) + for i in org_info_pair: + org_info_pair_wo_trans_rxn[i] = list( + set(org_info_pair[i]) - set(comm_util.transport_list()) + ) + # find all the relieved reactions in every pair + find_relieved_rxn(models, "relieved_rxns", org_info_single, org_info_pair) + # calculate MSI for every pair + msi = {} + for org1 in models: + stuck_A = len(org_info_single_wo_trans_rxn[org1.id + "_" + org1.id]) + for org2 in models: + if org1.id + "_" + org2.id in org_info_pair_wo_trans_rxn.keys(): + stuck_AUB = len(org_info_pair_wo_trans_rxn[org1.id + "_" + org2.id]) + if stuck_A == 0: + msi[org1.id + "_" + org2.id] = 0 + else: + msi[org1.id + "_" + org2.id] = 1 - (stuck_AUB / stuck_A) + return msi, community_model + + +def calculate_pairwiseMSI(models, media): + """ + This function calculates pairwise-MSI for all given microbes. + + Creates a csv file containing the MSI values of all pairs. + + Creates a tsv file containing the list of reactions relieved + in all acceptor microbes in the presence of corresponding donor microbes.
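The pairwise MSI of an acceptor A in the presence of a donor B is computed + in pMSI() as 1 - |stuck(A with B)| / |stuck(A alone)|, after transport + reactions have been filtered out.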
:param models: list of GSMM model objects + :param media: dictionary of seed metabolites + """ + + warnings.filterwarnings("ignore") + msi, community_model = pMSI(models, media) + # media is a dictionary of seed metabolites, so a static output name is used here + msi_output_file = "results/MSI.csv" + with open(msi_output_file, "w") as f: + header = "organism,in_the_presence,msi_value\n" + f.write(header) + for org1, org2 in combinations(models, 2): + if org1.id + "_" + org2.id in msi.keys(): + f.write(f"{org1.id},{org2.id},{str(msi[org1.id + '_' + org2.id])}\n") + print("MSI values are written at:\n", msi_output_file) + + +def calculate_higherorderMSI(models, media, clusters="individual_clusters"): + community_model = commhelper.build_from_species_models(models) + comm_util = MSModelUtil(community_model) + transport_rxns = comm_util.transport_list() + org_info, scope, namemap = find_stuckrxns(models, models, media, len(models)) + org_info = decrypt_orginfo(org_info, namemap) + org_info_wo_trans_rxn = { + i: list(set(org_info[i]) - set(transport_rxns)) for i in org_info + } + + with open("results/community_unperturbed.csv", "w") as f: + for i, diff in org_info_wo_trans_rxn.items(): + f.write(i + "," + str(len(diff)) + "\n") + + if clusters == "individual_clusters": + rem_org_list1, rem_org_list2 = {}, {} + for i, member in enumerate(models): + rem_org_list1[i] = [member.id] + rem_org_list2[i] = [member.id] + else: + cluster_data = pd.read_csv(clusters, sep=",") + rem_org_list1 = cluster_data.set_index("Cluster").T.to_dict("list") + for n in rem_org_list1: + rem_org_list1[n] = [j for j in rem_org_list1[n] if pd.isna(j) is False] + for n in rem_org_list1: + rem_org_list1[n] = [ + cobra.io.read_sbml_model(i).id for i in rem_org_list1[n] + ] + # rem_org_list1[n] = [model_ids[model_ids.index(i)] for i in rem_org_list1[n]] + rem_org_list2 = rem_org_list1.copy() + + for nclus in rem_org_list2: + rem_org_list2[nclus] = [x.replace(".xml", "") for x in rem_org_list2[nclus]] + + with open("results/higher_order_msi.csv", "w") as f: + for n in rem_org_list1: + # perturb a fresh copy of the community in every iteration, since + # make_perturbed_community mutates the lists that it receives + new_models = list(models) + new_community = list(models) + + pert_models, pert_community, pert_model_ids = make_perturbed_community( + rem_org_list1[n], new_models, new_community + ) + + org_info_pert, org_info_pert_wo_trans_rxn, msi = perform_task( + media, + pert_models, + transport_rxns, + pert_community, + org_info_wo_trans_rxn, + rem_org_list2[n], + n, + ) + for i in rem_org_list2[n]: + f.write("Comm,clus_" + str(n) + "#" + i + "," + str(msi) + "\n") + + if msi: + relieved, detailed_rel_rxns, rel_rxns_name = find_relievedrxns( + pert_models, org_info, org_info_pert + ) + with open( + f"results/clusterKO_/data_analysis/relieved_rxns_Comm--clus{n}.tsv", + "w", + ) as g: + write_relieved_rxns(g, relieved, detailed_rel_rxns, rel_rxns_name) + with open( + f"results/clusterKO_/data_analysis/Comm--clus{n}.tsv", "w" + ) as h: + h.write("Comm--clus" + str(n) + "\n") + for i in rem_org_list2[n]: + h.write(i + "\n") + h.write( + "num of rxns relieved in the below orgs in the presence of clust" + + str(n) + + "\n" + ) + h.write("org,unpert,clust_" + str(n) + "KO,rxns relieved\n") + write_relieved_rxn_metadata( + h, org_info_wo_trans_rxn, org_info_pert_wo_trans_rxn + ) + print("Comm--clus" + str(n)) + + # rebuild the complete community before the complementary knock-out + new_models = list(models) + new_community = list(models) + ko_models, ko_community, model_ids = 
make_perturbed_community( + pert_model_ids, new_models, new_community + ) + ko_org_list = [x for x in pert_model_ids] + if len(ko_org_list) < len(models): + org_info_pert, org_info_pert_wo_trans_rxn, msi = perform_task( + media, + ko_models, + transport_rxns, + ko_community, + org_info_wo_trans_rxn, + ko_org_list, + n, + ) + for i in ko_community: + # ko_community contains model objects, so their ids are written + f.write("clus_" + str(n) + "#" + i.id + ",Comm," + str(msi) + "\n") + + if msi: + relieved, detailed_rel_rxns, rel_rxns_name = find_relievedrxns( + ko_models, org_info, org_info_pert + ) + with open( + f"results/clusterKO_/data_analysis/relieved_rxns_Comm--clus{n}.tsv", + "w", + ) as g: + write_relieved_rxns( + g, relieved, detailed_rel_rxns, rel_rxns_name + ) + with open( + f"results/clusterKO_/data_analysis/Comm{n}--clus.tsv", "w" + ) as h: + h.write("clus" + str(n) + "--Comm\n") + for i in ko_org_list: + h.write(i + "\n") + h.write( + "num of rxns relieved in the below orgs in the presence of Comm\n" + ) + h.write("org,unpert,commKO,rxns relieved\n") + write_relieved_rxn_metadata( + h, org_info_wo_trans_rxn, org_info_pert_wo_trans_rxn + ) + print("clus" + str(n) + "--Comm") diff --git a/modelseedpy/community/mscommfitting.py b/modelseedpy/community/mscommfitting.py new file mode 100644 index 00000000..2a42d63d --- /dev/null +++ b/modelseedpy/community/mscommfitting.py @@ -0,0 +1,1091 @@ +# -*- coding: utf-8 -*- +from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.core.exceptions import FeasibilityError +from pandas import read_table, read_csv, DataFrame +from optlang import Variable, Constraint, Objective, Model +from modelseedpy.core.fbahelper import FBAHelper +from scipy.constants import hour +from scipy.optimize import newton +from collections import OrderedDict +from zipfile import ZipFile, ZIP_LZMA +from optlang.symbolics import Zero +from sympy.core.add import Add +from matplotlib import pyplot + +# from pprint import pprint +from time import sleep, process_time +import numpy as np + +# from cplex import Cplex +import json, os, re + + +def _name(name, suffix, time, trial): + return "-".join([name + suffix, time, trial]) + + +class MSCommFitting: + def __init__(self): + ( + self.parameters, + self.variables, + self.constraints, + self.dataframes, + self.signal_species, + self.values, + ) = ({}, {}, {}, {}, {}, {}) + self.phenotypes_parsed_df: np.ndarray + self.problem: object + self.species_phenotypes_bool_df: object + self.zipped_output, self.plots = [], [] + + def _process_csv(self, csv_path, index_col): + self.zipped_output.append(csv_path) + csv = read_csv(csv_path) + csv.index = csv[index_col] + csv.drop(index_col, axis=1, inplace=True) + # astype returns a new DataFrame, so the result must be assigned + csv = csv.astype(str) + return csv + + def load_data( + self, + community_members: dict = {}, + kbase_token: str = None, + solver: str = "glpk", + signal_tsv_paths: dict = {}, + phenotypes_csv_path: str = None, + media_conc_path: str = None, + species_abundance_path: str = None, + carbon_conc_series: dict = {}, + ignore_trials: dict = {}, + ignore_timesteps: list = [], + significant_deviation: float = 2, + zip_path: str = None, + ): + self.zipped_output = [] + if zip_path: + with ZipFile(zip_path, "r") as zp: + zp.extractall() + if species_abundance_path: + self.species_abundances = self._process_csv( + species_abundance_path, "trial_column" + ) + if phenotypes_csv_path: + # process a predefined exchanges table + self.zipped_output.append(phenotypes_csv_path) + fluxes_df = read_csv(phenotypes_csv_path) + fluxes_df.index = fluxes_df["rxn"] + to_drop = [col for col in 
fluxes_df.columns if " " in col] + for col in to_drop + ["rxn"]: + fluxes_df.drop(col, axis=1, inplace=True) + print( + f'The {to_drop+["rxn"]} columns were dropped from the phenotypes CSV.' + ) + + # import and process the media concentrations CSV + self.media_conc = self._process_csv(media_conc_path, "media_compound") + elif community_members: + # import the media for each model + models = OrderedDict() + ex_rxns: set = set() + species: dict = {} + # Using KBase media to constrain exchange reactions in model + for model, content in community_members.items(): + model.solver = solver + ex_rxns.update(model.exchanges) + species.update({content["name"]: content["phenotypes"].keys()}) + models[model] = [] + for media in content["phenotypes"].values(): + with model: # !!! Is this the correct method of parameterizing a media for a model? + pkgmgr = MSPackageManager.get_pkg_mgr(model) + pkgmgr.getpkg("KBaseMediaPkg").build_package( + media, default_uptake=0, default_excretion=1000 + ) + models[model].append(model.optimize()) + + # construct the parsed table of all exchange fluxes for each phenotype + fluxes_df = DataFrame( + data={ + "bio": [ + sol.fluxes["bio1"] + for solutions in models.values() + for sol in solutions + ] + }, + columns=["rxn"] + + [ + spec + "-" + phenotype + for spec, phenotypes in species.items() + for phenotype in phenotypes + ] + + [spec + "-stationary" for spec in species.keys()], + ) + fluxes_df.index.name = "rxn" + fluxes_df.drop("rxn", axis=1, inplace=True) + for ex_rxn in ex_rxns: + elements = [] + for model, solutions in models.items(): + for sol in solutions: + elements.append( + sol.fluxes[ex_rxn.id] if ex_rxn.id in sol.fluxes else 0 + ) + if any(np.array(elements) != 0): + # label-based indexing, since the exchange reaction ids are strings + fluxes_df.loc[ex_rxn.id] = elements + + # define only species for which data is defined in signal_tsv_paths + modeled_species = list(signal_tsv_paths.values()) + modeled_species.remove("OD") + removed_phenotypes = [ + col + for col in fluxes_df + if not any([species in col for species in modeled_species]) + ] + for col in removed_phenotypes: + fluxes_df.drop(col, axis=1, inplace=True) + if removed_phenotypes != []: + print( + f"The {removed_phenotypes} phenotypes were removed since their species is not among those that are defined with data: {modeled_species}."
+ ) + fluxes_df.astype(str) + self.phenotypes_parsed_df = FBAHelper.parse_df(fluxes_df) + self.species_phenotypes_bool_df = DataFrame( + columns=self.phenotypes_parsed_df[1] + ) + + if "columns" not in carbon_conc_series: + carbon_conc_series["columns"] = {} + if "rows" not in carbon_conc_series: + carbon_conc_series["rows"] = {} + self.carbon_conc = carbon_conc_series + + self.parameters["data_timestep_hr"] = [] + if "columns" not in ignore_trials: + ignore_trials["columns"] = [] + if "rows" not in ignore_trials: + ignore_trials["rows"] = [] + if "wells" not in ignore_trials: + ignore_trials["wells"] = [] + ignore_trials["columns"] = list(map(str, ignore_trials["columns"])) + ignore_trials["rows"] = list(map(str, ignore_trials["rows"])) + ignore_timesteps = list(map(str, ignore_timesteps)) + for path, name in signal_tsv_paths.items(): + self.zipped_output.append(path) + signal = os.path.splitext(path)[0].split("_")[0] + # define the signal dataframe + self.signal_species[signal] = name # {name:phenotypes} + self.dataframes[signal] = read_table(path) + self.simulation_time = self.dataframes[signal].iloc[0, -1] / hour + self.parameters["data_timestep_hr"].append( + self.simulation_time / int(self.dataframes[signal].columns[-1]) + ) + self.dataframes[signal] = self.dataframes[signal].iloc[ + 1::2 + ] # excludes the times + self.dataframes[signal].index = self.dataframes[signal]["Well"] + # filter data contents + dropped_trials = [] + for trial in self.dataframes[signal].index: + if any( + [ + trial[0] in ignore_trials["rows"], + trial[1:] in ignore_trials["columns"], + trial in ignore_trials["wells"], + ] + ): + self.dataframes[signal].drop(trial, axis=0, inplace=True) + dropped_trials.append(trial) + if dropped_trials != []: + print( + f"The {dropped_trials} trials were dropped from the {name} measurements." + ) + for col in ["Plate", "Cycle", "Well"]: + self.dataframes[signal].drop(col, axis=1, inplace=True) + for col in self.dataframes[signal]: + if col in ignore_timesteps: + self.dataframes[signal].drop(col, axis=1, inplace=True) + if "OD" not in signal: + removed_trials = [] + for trial, row in self.dataframes[signal].iterrows(): + row_array = np.array(row.to_list()) + if row_array[-1] / row_array[0] < significant_deviation: + self.dataframes[signal].drop(trial, axis=0, inplace=True) + removed_trials.append(trial) + if removed_trials != []: + print( + f"The {removed_trials} trials were removed from the {name} measurements, with their deviation over time being less than the threshold of {significant_deviation}." 
+ ) + + # process the data for subsequent operations and optimal efficiency + self.dataframes[signal] = self.dataframes[signal].astype(str) + self.dataframes[signal]: np.ndarray = FBAHelper.parse_df( + self.dataframes[signal] + ) + + # differentiate the phenotypes for each species + if "OD" not in signal: + self.species_phenotypes_bool_df.loc[signal]: np.ndarray[int] = np.array( + [ + 1 if self.signal_species[signal] in pheno else 0 + for pheno in self.phenotypes_parsed_df[1] + ] + ) + + self.parameters["data_timestep_hr"] = sum( + self.parameters["data_timestep_hr"] + ) / len(self.parameters["data_timestep_hr"]) + self.data_timesteps = int( + self.simulation_time / self.parameters["data_timestep_hr"] + ) + + def define_problem( + self, + parameters={}, + zip_name: str = None, + export_parameters: bool = True, + export_lp: bool = True, + final_relative_carbon_conc: float = None, + metabolites_to_track: list = None, + ): + self.parameters.update( + { + "timestep_hr": self.parameters[ + "data_timestep_hr" + ], # Timestep size of the simulation in hours + "cvct": 1, # Coefficient for the minimization of phenotype conversion to the stationary phase. + "cvcf": 1, # Coefficient for the minimization of phenotype conversion from the stationary phase. + "bcv": 1, # This is the highest fraction of biomass for a given species that can change phenotypes in a single time step + "cvmin": 0, # This is the lowest value to which the limit on phenotype conversion can go + "v": 1000, # the kinetics constant that is externally adjusted + "carbon_sources": ["cpd00136", "cpd00179"], # 4hb, maltose + "diffpos": 1, + "diffneg": 1, # objective coefficients of the diffpos and diffneg variables that correspond with the components of difference between experimental and predicted biomass values + } + ) + self.parameters.update(parameters) + self.problem = Model() + print("Solver:", type(self.problem)) + trial: str + time: str + name: str + phenotype: str + met: str + obj_coef = {} + constraints: list = [] + variables: list = ( + [] + ) # lists are orders-of-magnitude faster than numpy arrays for appending + self.simulation_timesteps = list( + map( + str, + range( + 1, int(self.simulation_time / self.parameters["timestep_hr"]) + 1 + ), + ) + ) + time_1 = process_time() + for signal, parsed_df in self.dataframes.items(): + for met in self.phenotypes_parsed_df[0]: + met_id = re.sub(r"(_\w\d+)", "", met) + met_id = met_id.replace("EX_", "", 1) + if ( + not metabolites_to_track + and met_id != "cpd00001" + or metabolites_to_track + and met_id in metabolites_to_track + ): + self.variables["c_" + met] = {} + self.constraints["dcc_" + met] = {} + initial_time = True + final_time = False + for time in self.simulation_timesteps: + if time == self.simulation_timesteps[-1]: + final_time = True + self.variables["c_" + met][time] = {} + self.constraints["dcc_" + met][time] = {} + for trial in parsed_df[0]: + # define the metabolite concentration variables + self.variables["c_" + met][time][trial] = Variable( + _name("c_", met, time, trial), lb=0, ub=1000 + ) + # constrain initial time concentrations to the media or a large number if it is not explicitly defined + if ( + initial_time and "bio" not in met_id + ): # !!! 
the value of initial_time changes + initial_val = ( + self.media_conc.at[met_id, "mM"] + if met_id in list(self.media_conc.index) + else 100 + ) + if ( + met_id in self.carbon_conc["rows"] + and trial[0] in self.carbon_conc["rows"][met_id] + ): + initial_val = self.carbon_conc["rows"][met_id][ + trial[0] + ] + if ( + met_id in self.carbon_conc["columns"] + and trial[1:] in self.carbon_conc["columns"][met_id] + ): + initial_val = self.carbon_conc["columns"][met_id][ + trial[1:] + ] + self.variables["c_" + met][time][trial] = Variable( + _name("c_", met, time, trial), + lb=initial_val, + ub=initial_val, + ) + # mandate complete carbon consumption + if ( + final_time + and met_id in self.parameters["carbon_sources"] + ): + self.variables["c_" + met][time][trial] = Variable( + _name("c_", met, time, trial), lb=0, ub=0 + ) + if final_relative_carbon_conc: + self.variables["c_" + met][time][trial] = Variable( + _name("c_", met, time, trial), + lb=0, + ub=self.variables["c_" + met]["1"][trial].lb + * final_relative_carbon_conc, + ) + variables.append(self.variables["c_" + met][time][trial]) + initial_time = False + break # prevents duplicated variables + for signal, parsed_df in self.dataframes.items(): + if "OD" not in signal: + for phenotype in self.phenotypes_parsed_df[1]: + if self.signal_species[signal] in phenotype: + self.constraints["dbc_" + phenotype] = {} + for time in self.simulation_timesteps: + self.constraints["dbc_" + phenotype][time] = {} + + for phenotype in self.phenotypes_parsed_df[1]: + self.variables["cvt_" + phenotype] = {} + self.variables["cvf_" + phenotype] = {} + self.variables["b_" + phenotype] = {} + self.variables["g_" + phenotype] = {} + self.variables["v_" + phenotype] = {} + self.constraints["gc_" + phenotype] = {} + self.constraints["cvc_" + phenotype] = {} + for time in self.simulation_timesteps: + self.variables["cvt_" + phenotype][time] = {} + self.variables["cvf_" + phenotype][time] = {} + self.variables["b_" + phenotype][time] = {} + self.variables["g_" + phenotype][time] = {} + self.variables["v_" + phenotype][time] = {} + self.constraints["gc_" + phenotype][time] = {} + self.constraints["cvc_" + phenotype][time] = {} + for trial in parsed_df[0]: + self.variables["b_" + phenotype][time][ + trial + ] = Variable( # predicted biomass abundance + _name("b_", phenotype, time, trial), lb=0, ub=100 + ) + self.variables["g_" + phenotype][time][ + trial + ] = Variable( # biomass growth + _name("g_", phenotype, time, trial), lb=0, ub=1000 + ) + + if "stationary" not in phenotype: + self.variables["cvt_" + phenotype][time][ + trial + ] = Variable( # conversion rate to the stationary phase + _name("cvt_", phenotype, time, trial), lb=0, ub=100 + ) + self.variables["cvf_" + phenotype][time][ + trial + ] = Variable( # conversion from to the stationary phase + _name("cvf_", phenotype, time, trial), lb=0, ub=100 + ) + + # 0 <= -cvt + bcv*b_{phenotype} + cvmin + self.constraints["cvc_" + phenotype][time][trial] = Constraint( + -self.variables["cvt_" + phenotype][time][trial] + + self.parameters["bcv"] + * self.variables["b_" + phenotype][time][trial] + + self.parameters["cvmin"], + lb=0, + ub=None, + name=_name("cvc_", phenotype, time, trial), + ) + + # g_{phenotype} - b_{phenotype}*v = 0 + self.constraints["gc_" + phenotype][time][trial] = Constraint( + self.variables["g_" + phenotype][time][trial] + - self.parameters["v"] + * self.variables["b_" + phenotype][time][trial], + lb=0, + ub=0, + name=_name("gc_", phenotype, time, trial), + ) + + obj_coef.update( + { + 
self.variables["cvf_" + phenotype][time][ + trial + ]: self.parameters["cvcf"], + self.variables["cvt_" + phenotype][time][ + trial + ]: self.parameters["cvct"], + } + ) + variables.extend( + [ + self.variables["cvf_" + phenotype][time][trial], + self.variables["cvt_" + phenotype][time][trial], + ] + ) + constraints.extend( + [ + self.constraints["cvc_" + phenotype][time][trial], + self.constraints["gc_" + phenotype][time][trial], + ] + ) + + variables.extend( + [ + self.variables["b_" + phenotype][time][trial], + self.variables["g_" + phenotype][time][trial], + ] + ) + + # define non-concentration variables + half_dt = self.parameters["data_timestep_hr"] / 2 + time_2 = process_time() + print(f"Done with biomass loop: {(time_2-time_1)/60} min") + for parsed_df in self.dataframes.values(): + for r_index, met in enumerate(self.phenotypes_parsed_df[0]): + met_id = re.sub("(\_\w\d+)", "", met) + met_id = met_id.replace("EX_", "", 1) + if ( + not metabolites_to_track + and "cpd00001" != met_id + or metabolites_to_track + and met_id in metabolites_to_track + ): + for trial in parsed_df[0]: + last_column = False + for time in self.simulation_timesteps: + next_time = str(int(time) + 1) + if next_time == self.simulation_timesteps[-1]: + last_column = True + # c_{met} + dt*sum_k^K() - c+1_{met} = 0 + self.constraints["dcc_" + met][time][trial] = Constraint( + self.variables["c_" + met][time][trial] + - self.variables["c_" + met][next_time][trial] + + np.dot( + self.phenotypes_parsed_df[2][r_index] * half_dt, + np.array( + [ + self.variables["g_" + phenotype][time][ + trial + ] + + self.variables["g_" + phenotype][ + next_time + ][trial] + for phenotype in self.phenotypes_parsed_df[ + 1 + ] + ] + ), + ), + ub=0, + lb=0, + name=_name("dcc_", met, time, trial), + ) + + constraints.append( + self.constraints["dcc_" + met][time][trial] + ) + if last_column: + break + break # prevents duplicated constraints + + time_3 = process_time() + print(f"Done with metabolites loop: {(time_3-time_2)/60} min") + for signal, parsed_df in self.dataframes.items(): + data_timestep = 1 + self.variables[signal + "__conversion"] = Variable( + signal + "__conversion", lb=0, ub=1000 + ) + variables.append(self.variables[signal + "__conversion"]) + + self.variables[signal + "__bio"] = {} + self.variables[signal + "__diffpos"] = {} + self.variables[signal + "__diffneg"] = {} + self.constraints[signal + "__bioc"] = {} + self.constraints[signal + "__diffc"] = {} # diffc is defined latter + for time in self.simulation_timesteps: + if ( + int(time) * self.parameters["timestep_hr"] + >= data_timestep * self.parameters["data_timestep_hr"] + ): # synchronizes user timesteps with data timesteps + data_timestep += 1 + if int(data_timestep) > self.data_timesteps: + break + next_time = str(int(time) + 1) + self.variables[signal + "__bio"][time] = {} + self.variables[signal + "__diffpos"][time] = {} + self.variables[signal + "__diffneg"][time] = {} + self.constraints[signal + "__bioc"][time] = {} + self.constraints[signal + "__diffc"][time] = {} + for r_index, trial in enumerate(parsed_df[0]): + total_biomass: Add = 0 + signal_sum: Add = 0 + from_sum: Add = 0 + to_sum: Add = 0 + for phenotype in self.phenotypes_parsed_df[1]: + total_biomass += self.variables["b_" + phenotype][time][ + trial + ] + val = ( + 1 + if "OD" in signal + else self.species_phenotypes_bool_df.loc[ + signal, phenotype + ] + ) + signal_sum += ( + val * self.variables["b_" + phenotype][time][trial] + ) + if all( + [ + "OD" not in signal, + self.signal_species[signal] 
in phenotype, + "stationary" not in phenotype, + ] + ): + from_sum += ( + val + * self.variables["cvf_" + phenotype][time][trial] + ) + to_sum += ( + val + * self.variables["cvt_" + phenotype][time][trial] + ) + for phenotype in self.phenotypes_parsed_df[1]: + if ( + "OD" not in signal + and self.signal_species[signal] in phenotype + ): + if "stationary" in phenotype: + # b_{phenotype} - sum_k^K(es_k*cvf) + sum_k^K(pheno_bool*cvt) - b+1_{phenotype} = 0 + self.constraints["dbc_" + phenotype][time][ + trial + ] = Constraint( + self.variables["b_" + phenotype][time][trial] + - from_sum + + to_sum + - self.variables["b_" + phenotype][next_time][ + trial + ], + ub=0, + lb=0, + name=_name("dbc_", phenotype, time, trial), + ) + else: + # -b_{phenotype} + dt*g_{phenotype} + cvf - cvt - b+1_{phenotype} = 0 + self.constraints["dbc_" + phenotype][time][ + trial + ] = Constraint( + self.variables["b_" + phenotype][time][trial] + - self.variables["b_" + phenotype][next_time][ + trial + ] + + half_dt + * ( + self.variables["g_" + phenotype][time][ + trial + ] + + self.variables["g_" + phenotype][ + next_time + ][trial] + ) + + self.variables["cvf_" + phenotype][time][ + trial + ] + - self.variables["cvt_" + phenotype][time][ + trial + ], + ub=0, + lb=0, + name=_name("dbc_", phenotype, time, trial), + ) + + constraints.append( + self.constraints["dbc_" + phenotype][time][trial] + ) + + self.variables[signal + "__bio"][time][trial] = Variable( + _name(signal, "__bio", time, trial), lb=0, ub=1000 + ) + self.variables[signal + "__diffpos"][time][trial] = Variable( + _name(signal, "__diffpos", time, trial), lb=0, ub=100 + ) + self.variables[signal + "__diffneg"][time][trial] = Variable( + _name(signal, "__diffneg", time, trial), lb=0, ub=100 + ) + + # {signal}__conversion*datum = {signal}__bio + self.constraints[signal + "__bioc"][time][trial] = Constraint( + self.variables[signal + "__conversion"] + * parsed_df[2][r_index, int(data_timestep) - 1] + - self.variables[signal + "__bio"][time][trial], + name=_name(signal, "__bioc", time, trial), + lb=0, + ub=0, + ) + + # {speces}_bio - sum_k^K(es_k*b_{phenotype}) - {signal}_diffpos + {signal}_diffneg = 0 + self.constraints[signal + "__diffc"][time][trial] = Constraint( + self.variables[signal + "__bio"][time][trial] + - signal_sum + - self.variables[signal + "__diffpos"][time][trial] + + self.variables[signal + "__diffneg"][time][trial], + name=_name(signal, "__diffc", time, trial), + lb=0, + ub=0, + ) + + obj_coef.update( + { + self.variables[signal + "__diffpos"][time][ + trial + ]: self.parameters["diffpos"], + self.variables[signal + "__diffneg"][time][ + trial + ]: self.parameters["diffneg"], + } + ) + variables.extend( + [ + self.variables[signal + "__bio"][time][trial], + self.variables[signal + "__diffpos"][time][trial], + self.variables[signal + "__diffneg"][time][trial], + ] + ) + constraints.extend( + [ + self.constraints[signal + "__bioc"][time][trial], + self.constraints[signal + "__diffc"][time][trial], + ] + ) + + time_4 = process_time() + print(f"Done with the dbc & diffc loop: {(time_4-time_3)/60} min") + # construct the problem + self.problem.add(variables) + self.problem.update() + self.problem.add(constraints) + self.problem.update() + self.problem.objective = Objective(Zero, direction="min") # , sloppy=True) + self.problem.objective.set_linear_coefficients(obj_coef) + time_5 = process_time() + print( + f"Done with loading the variables, constraints, and objective: {(time_5-time_4)/60} min" + ) + + # print contents + if export_parameters: + 
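+        # A minimal sketch of the objective-loading pattern used above, assuming
+        # optlang's standard API: the objective is instantiated empty and the
+        # coefficients are loaded in a single pass, e.g.
+        #     model.objective = Objective(Zero, direction="min")
+        #     model.objective.set_linear_coefficients({variable: coefficient})
+        # which is much faster than symbolically summing thousands of terms,
+        # and is why obj_coef was accumulated as a {Variable: float} dict.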
+            self.zipped_output.append("parameters.csv")
+            DataFrame(
+                data=list(self.parameters.values()),
+                index=list(self.parameters.keys()),
+                columns=["values"],
+            ).to_csv("parameters.csv")
+        if export_lp:
+            self.zipped_output.extend(["mscommfitting.lp", "mscommfitting.json"])
+            with open("mscommfitting.lp", "w") as lp:
+                lp.write(self.problem.to_lp())
+            with open("mscommfitting.json", "w") as lp:
+                json.dump(self.problem.to_json(), lp, indent=3)
+        if zip_name:
+            self.zip_name = zip_name
+            sleep(2)
+            with ZipFile(self.zip_name, "w", compression=ZIP_LZMA) as zp:
+                for file in self.zipped_output:
+                    zp.write(file)
+                    os.remove(file)
+
+        time_6 = process_time()
+        print(f"Done exporting the content: {(time_6-time_5)/60} min")
+
+    def compute(self, graphs: list = [], zip_name=None):
+        solution = self.problem.optimize()
+        # categorize the primal values by trial and time
+        for variable, value in self.problem.primal_values.items():
+            if "conversion" not in variable:
+                basename, time, trial = variable.split("-")
+                time = int(time) * self.parameters["data_timestep_hr"]
+                if trial not in self.values:
+                    self.values[trial] = {}
+                if basename not in self.values[trial]:
+                    self.values[trial][basename] = {}
+                self.values[trial][basename][time] = value
+
+        # export the processed primal values for graphing
+        with open("primal_values.json", "w") as out:
+            json.dump(self.values, out, indent=3)
+        if not zip_name and hasattr(self, "zip_name"):  # fall back to the zip from define_problem()
+            zip_name = self.zip_name
+        if zip_name:
+            with ZipFile(zip_name, "a", compression=ZIP_LZMA) as zp:
+                zp.write("primal_values.json")
+                os.remove("primal_values.json")
+
+        if graphs:
+            self.graph(graphs, zip_name=zip_name)
+
+        if "optimal" in solution:
+            print("The solution is optimal.")
+        else:
+            raise FeasibilityError(
+                f"The solution is sub-optimal, with a {solution} status."
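+                # optlang reports solver status strings such as "optimal",
+                # "infeasible", or "unbounded"; any non-optimal status is raised here.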
+ ) + + def graph( + self, + graphs=[], + primal_values_filename: str = None, + primal_values_zip_path: str = None, + zip_name: str = None, + data_timestep_hr: float = 0.163, + ): + def add_plot(ax, labels, basename, trial): + labels.append(basename.split("-")[-1]) + ax.plot( + self.values[trial][basename].keys(), + self.values[trial][basename].values(), + label=basename, + ) + ax.legend(labels) + ax.set_xticks( + list(self.values[trial][basename].keys())[ + :: int(2 / data_timestep_hr / timestep_ratio) + ] + ) + return ax, labels + + timestep_ratio = 1 + if self.parameters != {}: + data_timestep_hr = self.parameters["data_timestep_hr"] + timestep_ratio = ( + self.parameters["data_timestep_hr"] / self.parameters["timestep_hr"] + ) + if primal_values_filename: + if primal_values_zip_path: + with ZipFile(primal_values_zip_path, "r") as zp: + zp.extract(primal_values_filename) + with open(primal_values_filename, "r", encoding="utf-8") as primal: + self.values = json.load(primal) + + # plot the content for desired trials + self.plots = [] + for graph in graphs: + if any([x in graph["content"] for x in ["total", "OD"]]): + ys = [] + print(graph) + pyplot.rcParams["figure.figsize"] = (11, 7) + pyplot.rcParams["figure.dpi"] = 150 + fig, ax = pyplot.subplots() + y_label = "Variable value" + x_label = "Time (hr)" + for trial, basenames in self.values.items(): + content = graph["content"] + if graph["content"] == "OD": + y_label = "Biomass (g)" + graph["phenotype"] = graph["species"] = "*" + elif "biomass" in graph["content"]: + content = "b" + y_label = "Biomass (g)" + elif graph["content"] == "growth": + content = "g" + y_label = "Biomass (g/hr)" + elif "stress-test" in graph["content"]: + content = graph["content"].split("_")[1] + y_label = graph["species"] + " coculture %" + x_label = content + " (mM)" + if trial == graph["trial"]: + labels: list = [] + for basename in basenames: + # parse for non-concentration variables + if any([x in graph["content"] for x in ["total", "OD"]]): + if "b_" in basename: + if graph["content"] == "OD": + labels.append("predicted") + label = "predicted" + xs = np.array( + list(self.values[trial][basename].keys()) + ) + ys.append( + np.array( + list(self.values[trial][basename].values()) + ) + ) + elif graph["content"] == "total": + if graph["species"] in basename: + labels.append("total_biomass") + label = "total_biomass" + xs = np.array( + list(self.values[trial][basename].keys()) + ) + ys.append( + np.array( + list( + self.values[trial][ + basename + ].values() + ) + ) + ) + if ( + "experimental_data" in graph + and graph["experimental_data"] + ): + if basename == "OD__bio": + labels.append("experimental") + exp_xs = np.array( + list(self.values[trial][basename].keys()) + ) + exp_xs = exp_xs.astype(np.float32) + exp_xs = np.around(exp_xs, 2) + ax.plot( + exp_xs, + list(self.values[trial][basename].values()), + label="experimental", + ) + ax.set_xticks( + exp_xs[ + :: int( + 2 / data_timestep_hr / timestep_ratio + ) + ] + ) + elif graph["phenotype"] == "*" and all( + [x in basename for x in [graph["species"], content]] + ): + if "total" in graph["content"]: + labels = [basename] + xs = np.array(list(self.values[trial][basename].keys())) + ys.append( + np.array( + list(self.values[trial][basename].values()) + ) + ) + else: + ax, labels = add_plot(ax, labels, basename, trial) + print("1") + # + elif all( + [ + x in basename + for x in [graph["species"], graph["phenotype"], content] + ] + ): + ax, labels = add_plot(ax, labels, basename, trial) + print("2") + # 
concentration plots
+                        elif "EX_" in basename and graph["content"] in basename:
+                            ax, labels = add_plot(ax, labels, basename, trial)
+                            y_label = "Concentration (mM)"
+                            print("3")
+
+                    if labels:
+                        if any([x in graph["content"] for x in ["total", "OD"]]):
+                            xs = xs.astype(np.float32)
+                            xs = np.around(xs, 2)
+                            ax.plot(xs, sum(ys), label=label)
+                            ax.set_xticks(
+                                xs[:: int(2 / data_timestep_hr / timestep_ratio)]
+                            )
+                        phenotype_id = (
+                            graph["phenotype"]
+                            if graph["phenotype"] != "*"
+                            else "all phenotypes"
+                        )
+                        species_id = (
+                            graph["species"]
+                            if graph["species"] != "*"
+                            else "all species"
+                        )
+                        ax.set_xlabel(x_label)
+                        ax.set_ylabel(y_label)
+                        if len(labels) > 1:
+                            ax.legend()
+                        ax.set_title(
+                            f'{graph["content"]} of {species_id} ({phenotype_id}) in the {trial} trial'
+                        )
+                        fig_name = f'{"_".join([trial, species_id, phenotype_id, graph["content"]])}.jpg'
+                        fig.savefig(fig_name)
+                        self.plots.append(fig_name)
+
+        # combine the figures with the other content
+        if not zip_name and hasattr(self, "zip_name"):  # fall back to the zip from define_problem()
+            zip_name = self.zip_name
+        if zip_name:
+            with ZipFile(zip_name, "a", compression=ZIP_LZMA) as zp:
+                for plot in self.plots:
+                    zp.write(plot)
+                    os.remove(plot)
+
+    def load_model(
+        self, mscomfit_json_path: str, zip_name: str = None, class_object: bool = False
+    ):
+        if zip_name:
+            with ZipFile(zip_name, "r") as zp:
+                zp.extract(mscomfit_json_path)
+        with open(mscomfit_json_path, "r") as mscmft:
+            model = Model.from_json(json.load(mscmft))
+            if class_object:
+                self.problem = model
+            return model
+
+    def change_parameters(
+        self,
+        cvt=None,
+        cvf=None,
+        diff=None,
+        vmax=None,
+        mscomfit_json_path="mscommfitting.json",
+        export_zip_name=None,
+        extract_zip_name=None,
+        final_concentrations: dict = None,
+        final_relative_carbon_conc: float = None,
+        previous_relative_conc: float = None,
+    ):
+        def change_param(arg, param, time, trial):
+            if param:
+                if not isinstance(param, dict):
+                    arg[0]["value"] = param
+                else:
+                    if time in param:
+                        if trial in param[time]:
+                            arg[0]["value"] = param[time][trial]
+                        else:  # a time-level value applies to every trial
+                            arg[0]["value"] = param[time]
+                    else:
+                        arg[0]["value"] = param["default"]
+            return arg
+
+        time_1 = process_time()
+        if not export_zip_name:
+            export_zip_name = self.zip_name
+        if not os.path.exists(mscomfit_json_path):
+            if not extract_zip_name:
+                extract_zip_name = self.zip_name
+            with ZipFile(extract_zip_name, "r") as zp:
+                zp.extract(mscomfit_json_path)
+            with open(mscomfit_json_path, "r") as mscmft:
+                mscomfit_json = json.load(mscmft)
+        else:
+            with open(mscomfit_json_path, "r") as mscmft:
+                mscomfit_json = json.load(mscmft)
+
+        time_2 = process_time()
+        print(f"Done loading the JSON: {(time_2-time_1)/60} min")
+
+        # change objective coefficients
+        for arg in mscomfit_json["objective"]["expression"]["args"]:
+            name, time, trial = arg["args"][1]["name"].split("-")
+            if "cvf" in name:
+                arg["args"] = change_param(arg["args"], cvf, time, trial)
+            elif "cvt" in name:
+                arg["args"] = change_param(arg["args"], cvt, time, trial)
+            elif "diff" in name:
+                arg["args"] = change_param(arg["args"], diff, time, trial)
+
+        # change final concentrations
+        if final_concentrations:  # absolute concentration
+            for met in mscomfit_json["variables"]:
+                name, time, trial = met["name"].split("-")
+                if (
+                    name in final_concentrations
+                    and time == self.simulation_timesteps[-1]
+                ):
+                    met["lb"] = 0
+                    met["ub"] = final_concentrations[name]
+
+        if final_relative_carbon_conc:  # relative concentration
+            for met in mscomfit_json["variables"]:
+                if "EX_" in met["name"]:
+                    name, time, trial = met["name"].split("-")
+                    if (
+                        any([x in name for x in self.parameters["carbon_sources"]])
+                        and time == self.simulation_timesteps[-1]
+                    ):
+                        print(met["ub"])
+                        met["lb"] = 0
+                        met["ub"] *= final_relative_carbon_conc
+                        if previous_relative_conc:
+                            met["ub"] /= previous_relative_conc
+                        print(met["ub"])
+
+        # change Vmax values
+        for arg in mscomfit_json["constraints"]:
+            name, time, trial = arg["name"].split("-")
+            if "gc" in name:
+                arg["expression"]["args"][1]["args"] = change_param(
+                    arg["expression"]["args"][1]["args"], vmax, time, trial
+                )
+
+        with open(mscomfit_json_path, "w") as mscmft:
+            json.dump(mscomfit_json, mscmft, indent=3)
+        with ZipFile(export_zip_name, "a", compression=ZIP_LZMA) as zp:
+            zp.write(mscomfit_json_path)
+            os.remove(mscomfit_json_path)
+        time_3 = process_time()
+        print(f"Done exporting the model: {(time_3-time_2)/60} min")
+
+        self.problem = Model.from_json(mscomfit_json)
+        time_4 = process_time()
+        print(
+            f"Done loading the model: {(time_4-time_3)/60} min"
+        )  # ~1/2 the time of defining a new problem
+
+    def introduce_km(
+        self, vmax, km, met, graphs, zipname, extract_zipname
+    ):
+        # Good starting values to try are vmax = 3.75 and km = 2.5, the equivalent of
+        # vmax = 0.5, because at a starting maltose of 5 this is
+        # vmax/(km + [maltose]) = 3.75/(2.5+5) = 0.5
+        vmax_var = {"default": -0.3}
+        last_conc = {}
+        count = 0
+        while 1:  # Dangerous - if there's never convergence, then this never stops
+            error = 0  # accumulates the squared deviation between successive iterations
+            for t in self.variables["c_" + met]:
+                if t not in vmax_var:
+                    vmax_var[t] = {}
+                if t not in last_conc:
+                    last_conc[t] = {}
+                for trial in self.variables["c_" + met][t]:
+                    if trial in last_conc[t]:
+                        error += (
+                            last_conc[t][trial]
+                            - self.variables["c_" + met][t][trial].primal
+                        ) ** 2
+                    last_conc[t][trial] = self.variables["c_" + met][t][trial].primal
+                    vmax_var[t][trial] = -1 * vmax / (km + last_conc[t][trial])
+                    count += 1
+            # Not sure if I'm using the vmax argument right here...
please check + self.change_parameters( + vmax_var, zipname, extract_zipname + ) # The Vmax argument can be either a number or a dictionary that is organized by ["time"]["trial"], just as the naming scheme of the variables and constraints + self.compute(graphs, zipname) + if error: + error = (error / count) ** 0.5 + print("Error:", error) + if ( + error < 1 + ): # Definitely don't know what the error threshold should actually be for convergence + break + + def parameter_optimization( + self, + ): + with ZipFile(self.zip_name, "r") as zp: + zp.extract("mscommfitting.json") + + newton diff --git a/modelseedpy/community/mscommunity.py b/modelseedpy/community/mscommunity.py index be8472e2..f1ce5e75 100644 --- a/modelseedpy/community/mscommunity.py +++ b/modelseedpy/community/mscommunity.py @@ -125,8 +125,15 @@ def __init__( lp_filename=None, # specify a filename to create an lp file ): # Setting model and package manager - self.model, self.lp_filename, self.pfba = model, lp_filename, pfba - self.pkgmgr = MSPackageManager.get_pkg_mgr(model) + if isinstance(model, MSModelUtil): + self.model = model.model + self.mdlutl = model + else: + self.model = model + self.mdlutl = MSModelUtil.get(model) + self.pkgmgr = MSPackageManager.get_pkg_mgr(self.model) + self.lp_filename = lp_filename + self.pfba = pfba self.gapfillings = {} # Define Data attributes as None self.solution = ( @@ -142,7 +149,7 @@ def __init__( ) = self.kinetic_coeff = self.modelseed_db_path = None self.species = DictList() # Computing data from model - msid_cobraid_hash = FBAHelper.msid_hash(model) + msid_cobraid_hash = self.mdlutl.msid_hash() if "cpd11416" not in msid_cobraid_hash: logger.critical("Could not find biomass compound") other_biomass_cpds = [] @@ -151,6 +158,7 @@ def __init__( self.biomass_cpd = biomass_cpd for reaction in model.reactions: if self.biomass_cpd in reaction.metabolites: + print(reaction.id, reaction.metabolites) if ( reaction.metabolites[self.biomass_cpd] == 1 and len(reaction.metabolites) > 1 @@ -165,13 +173,14 @@ def __init__( other_biomass_cpds.append(biomass_cpd) for biomass_cpd in other_biomass_cpds: species_obj = CommunityModelSpecies(self, biomass_cpd, names) + print(species_obj.index,species_obj.id) self.species.append(species_obj) if abundances: self.set_abundance(abundances) @staticmethod def build_from_species_models( - models, mdlid=None, name=None, names=[], abundances=None + models, mdlid=None, name=None, names=[], abundances=None,basemodel=None ): """Merges the input list of single species metabolic models into a community metabolic model @@ -196,8 +205,11 @@ def build_from_species_models( Raises ------ """ - newmodel = Model(mdlid, name) - newutl = MSModelUtil(newmodel) + if basemodel: + newmodel = basemodel + else: + newmodel = Model(mdlid, name) + newutl = MSModelUtil.get(newmodel) biomass_compounds = [] index = 1 biomass_index = 2 @@ -230,7 +242,7 @@ def build_from_species_models( met.id = output[0] + "_" + output[1] + str(index) if met.id not in newmodel.metabolites: new_metabolites.append(met) - if met.id == "cpd11416": + if newutl.metabolite_msid(met) == "cpd11416": biomass_compounds.append(met) # Rename reactions for rxn in model.reactions: diff --git a/modelseedpy/community/mskineticsfba.py b/modelseedpy/community/mskineticsfba.py new file mode 100644 index 00000000..9309b455 --- /dev/null +++ b/modelseedpy/community/mskineticsfba.py @@ -0,0 +1,444 @@ +# -*- coding: utf-8 -*- + +from scipy.constants import milli, hour, minute, day, femto +from modelseedpy.fbapkg.basefbapkg import 
BaseFBAPkg
+from modelseedpy import MSModelUtil
+from optlang import Constraint
+from modelseedpy.core.fbahelper import FBAHelper
+from collections import OrderedDict
+from optlang.symbolics import Zero
+from numpy import log10, nan, mean
+from warnings import warn
+from matplotlib import pyplot
+from pprint import pprint
+from datetime import date
+from math import inf
+from IPython.display import display  # the display() calls below assume an IPython/Jupyter session
+import pandas
+import json, re, os
+
+
+def _x_axis_determination(total_time):
+    # select a scalar that converts minutes to the axis unit; the spans are checked
+    # from longest to shortest so that every branch is reachable
+    time = total_time * minute
+    if time > day:  # the day cutoff is an assumed threshold; the original branch was unreachable
+        return minute / day, "days"
+    if time > 7200:
+        return minute / hour, "hr"
+    if time > 600:
+        return 1, "min"
+    return minute, "s"
+
+
+def _check_datum(datum):
+    if "substituted_rate_law" not in datum:
+        print(f"RateLawError: The {datum} datum lacks a rate law.")
+        return False
+    remainder = re.sub("([0-9A-Za-z/()e\-\+\.\*\_])", "", datum["substituted_rate_law"])
+    if remainder != "":
+        print(
+            f'RateLawError: The {datum["substituted_rate_law"]}'
+            f" rate law contains unknown characters: {remainder}"
+        )
+        return False
+    return True
+
+
+class MSKineticsFBA:
+    def __init__(
+        self,
+        model,
+        warnings: bool = True,
+        verbose: bool = False,
+        printing: bool = False,
+        jupyter: bool = False,
+    ):
+        self.warnings, self.verbose, self.printing, self.jupyter = (
+            warnings,
+            verbose,
+            printing,
+            jupyter,
+        )
+        self.model_util = MSModelUtil(model)
+        self.met_ids = OrderedDict(
+            {met.id: met.id for met in self.model_util.model.metabolites}
+        )
+
+    def baseKinFBA(
+        self,
+        kinetics_path: str = None,
+        kinetics_data: dict = None,
+        initial_M: dict = None,  # a dictionary of the initial metabolite concentrations, which supplants concentrations from the defined kinetics data
+        total_min: float = 200,
+        ts_min: float = 20,
+        export_name=None,
+        export_directory=None,
+        chemostat_L: float = None,
+        feed_profile: dict = None,
+        chemostat_L_hr: float = None,
+        temperature: float = 25,
+        p_h: float = 7,
+        cell_dry_g: float = 1.44e-13,
+        cellular_L: float = 1e-18,
+        conc_figure_title="Metabolic perturbation",
+        included_mets: list = None,
+        labeled_plots=True,
+        visualize=True,
+        export=True,
+    ):
+        # define the dataframe for the time series content
+        feed_profile, constrained, self.constraints = feed_profile or {}, {}, {}
+        included_mets, self.sols = included_mets or [], []
+        self.parameters = {
+            "timesteps": int(total_min / ts_min),
+            "pH": p_h,
+            "temperature": temperature,
+        }
+        self.variables = {"elapsed_time": 0}
+        self.ts_min, self.minimum = ts_min, inf
+        timestep_hr = self.ts_min / (hour / minute)
+        self.constrained = OrderedDict()
+        cell_g_L = (
+            cell_dry_g / cellular_L
+        )  # https://journals.asm.org/doi/full/10.1128/AEM.64.2.688-694.1998
+
+        # define reaction kinetics and initial concentrations
+        assert (
+            kinetics_path or kinetics_data
+        ), "Either < kinetics_path > or < kinetics_data > must be provided"
+        if kinetics_path:
+            with open(kinetics_path) as data:
+                self.kinetics_data = json.load(data)
+        elif kinetics_data:
+            self.kinetics_data = kinetics_data.copy()
+        ## define the concentration, moles, and fluxes DataFrames
+        self.time = "0 min"
+        self.conc = pandas.DataFrame(
+            [0] * len(self.met_ids),
+            index=list(self.met_ids.keys()),
+            columns=[self.time],
+        )
+        self.conc.index.name = "metabolite (mM)"
+        self.moles = self.conc.copy(deep=True)
+        self.fluxes = pandas.DataFrame(
+            index=[rxn.id for rxn in self.model_util.model.reactions],
+            columns=[self.time],
+        )
+        self.fluxes.index.name = "reaction (\u0394mmol/hr*g_(dw))"  # Delta
+        ## parse the kinetics data
+        for content in self.kinetics_data.values():
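+        ## The kinetics data is assumed to be nested as {rxn_id: {source: datum}},
+        ## where each datum carries at least a "substituted_rate_law" and optionally
+        ## "initial_M" and "met_id" maps, e.g. (identifiers are illustrative only):
+        ##   {"rxn00148": {"BRENDA": {"substituted_rate_law": "vmax*S/(km + S)",
+        ##                            "mets": {"S": "cpd00027_c0"},
+        ##                            "initial_M": {"S": 0.005},
+        ##                            "met_id": {"S": "cpd00027_c0"}}}}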
+            for condition, datum in content.items():
+                if "initial_M" not in datum:
+                    continue
+                for var, conc in datum["initial_M"].items():
+                    met_id = datum["met_id"][var]
+                    if met_id in self.met_ids:
+                        self.conc.at[met_id, self.time] += conc / milli
+                    elif self.warnings:
+                        warn(
+                            f"KineticsError: The {met_id} reagent ({var}) in the"
+                            f" {datum['substituted_rate_law']} rate law is not defined by the model."
+                        )
+        ## incorporate custom initial concentrations, which overwrite values from the kinetics data
+        for met_id in initial_M or {}:  # initial_M defaults to None, hence the guard
+            self.conc.at[met_id, self.time] = initial_M[met_id] / milli
+        defined_concs = self.conc[self.conc[self.time] != 0][self.time].to_dict()
+        chemostat_requirements = [
+            chemostat_L is not None,
+            feed_profile != {},
+            chemostat_L_hr is not None,
+        ]
+        # execute FBA for each timestep, then calculate custom fluxes, constrain the model, and update concentrations
+        model_rxns = [rxn.id for rxn in self.model_util.model.reactions]
+        newTime = 0
+        for timestep in range(1, self.parameters["timesteps"] + 1):
+            oldTime = newTime
+            newTime = timestep * self.ts_min
+            t = timestep * timestep_hr
+            self.previous_time = f"{oldTime} min"
+            self.time = f"{newTime} min"
+            self.conc[self.time] = [float(0)] * len(self.conc.index)
+            self.fluxes[self.time] = [0] * len(self.fluxes.index)
+            ## create a metabolite constraint that prevents negative concentrations
+            for met in self.model_util.model.metabolites:
+                if met.id not in defined_concs:
+                    continue
+                if met.id not in self.constraints:
+                    self.constraints[met.id] = {}
+                coef = {}
+                for rxn in met.reactions:
+                    ### the product of the reaction stoichiometry and the timestep
+                    stoich = abs(timestep_hr * rxn.metabolites[met])
+                    coef[rxn.forward_variable], coef[rxn.reverse_variable] = (
+                        stoich,
+                        -stoich,
+                    )
+                ### build the metabolite constraint
+                if newTime - self.ts_min in self.constraints[met.id]:
+                    self.model_util.remove_cons_vars(
+                        [self.constraints[met.id][newTime - self.ts_min]]
+                    )
+                self.constraints[met.id][newTime] = Constraint(
+                    Zero, lb=0, ub=None, name=f"{met.id}_conc"
+                )
+                self.model_util.create_constraint(
+                    self.constraints[met.id][newTime], coef
+                )
+            ## calculate the flux
+            display(self.conc[self.conc["0 min"] != 0], self.fluxes)
+            for rxnID in self.kinetics_data:
+                # TODO allocate the following code into a function and recursively reduce the timestep until
+                ## the concentration is no longer negative, following the model of microBialSim. This may require
+                ## time dependency in the kinetics expression to achieve the desired behavior.
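+                ## A sketch of that adaptive refinement (hypothetical helper, not
+                ## implemented in this diff): halve the sub-timestep and re-evaluate
+                ## the rate law until no concentration would become negative, e.g.
+                ##   sub_dt = timestep_hr
+                ##   while conc + rate(sub_dt) * sub_dt < 0:
+                ##       sub_dt /= 2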
+                if rxnID not in model_rxns:
+                    if self.warnings:  # warn, but always skip reactions that are absent from the model
+                        warn(f"ReactionError: {rxnID} is not in the model.")
+                    continue
+                fluxes = []
+                for source in self.kinetics_data[rxnID]:
+                    datum = self.kinetics_data[rxnID][source]
+                    if not _check_datum(datum):
+                        continue
+                    ### define rate law variables; calculate flux; average or overwrite the flux based on data criteria
+                    locals().update(
+                        {
+                            metID: self.conc.at[metID, self.previous_time] * milli
+                            for metID in datum["mets"]
+                        }
+                    )
+                    flux = eval(datum["substituted_rate_law"])
+                    print(datum["substituted_rate_law"], flux)
+                    if (
+                        "metadata" not in self.kinetics_data[rxnID][source]
+                        or self.__find_data_match(rxnID, source) == "a"
+                    ):
+                        fluxes.append(flux)
+                    else:
+                        fluxes = [flux]
+
+                flux = mean(fluxes)
+                rxn = self.model_util.model.reactions.get_by_id(rxnID)
+                rxn.bounds = (flux, flux)  # assign both bounds at once to avoid a transient lb > ub error
+                self.fluxes.at[rxnID, self.time] = flux
+            ## execute the COBRA model
+            sol = self.model_util.model.optimize()
+            self.sols.append(sol)
+            ## add previously undefined fluxes and concentrations
+            for rxnID in self.fluxes.index:
+                if self.fluxes.at[rxnID, self.time] == 0:
+                    self.fluxes.at[rxnID, self.time] = sol.fluxes[rxnID]
+            for met in self.model_util.model.metabolites:
+                self.conc.at[met.id, self.time] = 0
+                for rxn in met.reactions:
+                    flux = self.fluxes.at[rxn.id, self.time]
+                    if flux == 0:
+                        continue
+                    # print(rxn.metabolites[met], flux, timestep_hr, cell_g_L)
+                    self.conc.at[met.id, self.time] += (
+                        rxn.metabolites[met] * flux * timestep_hr * cell_g_L
+                    )
+            if all(chemostat_requirements):
+                self.moles[self.time] = self.conc[self.time] * milli * chemostat_L
+                self._chemostat(feed_profile, chemostat_L_hr, chemostat_L)
+            elif any(chemostat_requirements):
+                warn(
+                    "The < chemostat_L > , < feed_profile >, and < chemostat_L_hr >"
+                    " parameters must all be defined to simulate a chemostat."
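+                    # i.e. a dilution step requires the vessel volume (chemostat_L),
+                    # the feed composition (feed_profile), and the flow rate (chemostat_L_hr)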
+                )
+            self.variables["elapsed_time"] += self.ts_min
+            if self.printing:
+                print(
+                    f"\nObjective value (\u0394t{self.ts_min}): ",
+                    self.sols[-1].objective_value,
+                )
+
+        # identify the chemicals whose concentrations changed over the simulation
+        self.changed = set(
+            [
+                met_id
+                for met_id in self.met_ids
+                if self.conc.at[met_id, "0 min"] != self.conc.at[met_id, self.time]
+            ]
+        )
+        self.unchanged = set(self.met_ids.keys()) - self.changed
+
+        # visualize concentration changes over time
+        if visualize:
+            self.total_min = total_min  # consumed by _visualize for axis scaling
+            self._visualize(conc_figure_title, included_mets, labeled_plots)
+        if export:
+            self._export(export_name or "kineticsFBA", export_directory)
+        if self.verbose:
+            print(
+                f"\nChanged concentrations:\t{self.changed}",
+                f"\nConstrained reactions:\t{constrained.keys()}",
+            )
+        elif self.printing:
+            if self.jupyter:
+                pandas.set_option("display.max_rows", None)
+            display(self.conc, self.fluxes)
+            if self.unchanged == set():
+                print(
+                    "All of the metabolites changed concentration over the simulation"
+                )
+            else:
+                print(f"\nUnchanged metabolite concentrations\t{self.unchanged}")
+        return self.conc, self.fluxes
+
+    def _chemostat(self, feed_profile: dict, chemostat_L_hr, chemostat_L):
+        L_changed = chemostat_L_hr * self.ts_min * minute / hour  # convert the timestep from minutes to hours
+        # chemostat addition
+        for met_id, conc in feed_profile.items():
+            self.moles.at[met_id, self.time] += conc * L_changed
+            self.conc.at[met_id, self.time] = (
+                self.moles.at[met_id, self.time] / milli / chemostat_L
+            )  # normalize to the chemostat volume
+        # chemostat subtraction
+        for met in self.model_util.model.metabolites:
+            if met.compartment[0] != "e":
+                continue
+            ## update the chemical moles
+            self.moles.at[met.id, self.time] -= (
+                self.conc.at[met.id, self.time] * L_changed
+            )
+            ## define the chemical concentration
+            self.conc.at[met.id, self.time] = (
+                self.moles.at[met.id, self.time] / milli / chemostat_L
+            )
+
+    # nested functions
+    def __find_data_match(self, rxnID: str, source: str):
+        # identifies the datum whose experimental conditions most closely match the simulation conditions
+        temperature_deviation = ph_deviation = 0
+        if FBAHelper.isnumber(
+            self.kinetics_data[rxnID][source]["metadata"]["Temperature"]
+        ):
+            temp = float(self.kinetics_data[rxnID][source]["metadata"]["Temperature"])
+            temperature_deviation = (
+                abs(self.parameters["temperature"] - temp)
+                / self.parameters["temperature"]
+            )
+        if FBAHelper.isnumber(self.kinetics_data[rxnID][source]["metadata"]["pH"]):
+            pH = float(self.kinetics_data[rxnID][source]["metadata"]["pH"])
+            ph_deviation = abs(self.parameters["pH"] - pH) / self.parameters["pH"]
+
+        # equally weight the temperature and pH deviations from the simulation conditions
+        old_minimum = self.minimum
+        deviation = mean([temperature_deviation, ph_deviation])  # mean() expects a sequence
+        self.minimum = min(deviation, self.minimum)
+        return (
+            "a" if old_minimum == self.minimum else "w"
+        )  # append or write a list of data
+
+    def _visualize(self, conc_fig_title, included_mets, labeled_plots):
+        # TODO construct a Vega visualization with a range bind that permits scanning over a time series
+        ## and accordingly adjusting arrowhead widths to reflect flux at the particular timestep.
+        ## The heatmap may likewise be dynamic for each timestep over a bind range.
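+        # The plotting below follows a simple rule of thumb: when the spread of
+        # plotted concentrations exceeds one order of magnitude (maximum > 10*minimum),
+        # the y-axis is switched to a log scale.  A standalone sketch of the same
+        # pattern, with hypothetical data:
+        #     fig, ax = pyplot.subplots()
+        #     ax.plot(times, concentrations)
+        #     if max(concentrations) > 10 * min(concentrations):
+        #         ax.set_yscale("log")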
+ + # define the figure + pyplot.rcParams["figure.figsize"] = (11, 7) + pyplot.rcParams["figure.dpi"] = 150 + self.figure, ax = pyplot.subplots() + ax.set_title(conc_fig_title) + ax.set_ylabel("Concentrations (mM)") + + x_axis_scalar, unit = _x_axis_determination(self.total_min) + ax.set_xlabel("Time " + unit) + legend_list = [] + times = [ + t * self.ts_min * x_axis_scalar + for t in range(self.parameters["timesteps"] + 1) + ] + + # determine the plotted metabolites and the scale of the figure axis + bbox = (1, 1) + if not included_mets: + bbox = (1.7, 1) + # 1e-2 is an arbitrary concentration threshold for plotting on the figure + included_mets = [ + chem + for chem in self.changed + if max(self.conc.loc[[chem]].values[0].tolist()) > 1e-2 + ] + + log_axis = False + minimum, maximum = inf, -inf + printed_concentrations = {} + for chem in self.changed: + if chem not in included_mets: + continue + concentrations = self.conc.loc[[chem]].values[0].tolist() + maximum = max(maximum, max([x if x > 1e-9 else 0 for x in concentrations])) + minimum = min(minimum, min([x if x > 1e-9 else 0 for x in concentrations])) + # plot chemicals with perturbed concentrations + ax.plot(times, concentrations) + if len(chem) > 25: + chem = list(self.met_ids.keys())[self.met_ids.index(chem)] + if not concentrations[0] < 1e-9: + legend_list.append(chem) + else: + legend_list.append(f"(rel) {chem}") + + # design the proper location of the overlaid labels in the figure + if not labeled_plots: + continue + for i, conc in enumerate(concentrations): + if conc <= 1e-9: + continue + x_value = i * self.ts_min + vertical_adjustment = 0 + if x_value in printed_concentrations: + vertical_adjustment = (maximum - minimum) * 0.05 + if log_axis: + vertical_adjustment = log10(maximum - minimum) / 3 + ax.text( + x_value, + conc + vertical_adjustment, + f"{chem} - {round(conc, 4)}", + ha="left", + ) + printed_concentrations[x_value] = conc + break + + # finalize figure details + if maximum > 10 * minimum: + ax.set_yscale("log") + ax.set_xticks(times) + ax.grid(True) + ax.legend( + legend_list, + title="Changed chemicals", + loc="upper right", + bbox_to_anchor=bbox, + title_fontsize="x-large", + fontsize="large", + ) + + def _export(self, export_name="kineticsFBA", export_directory: str = None): + # define a unique simulation name + directory = ( + os.path.dirname(export_directory) if export_directory else os.getcwd() + ) + self.parameters["simulation_path"] = self.simulation_path = os.path.join( + directory, export_name + ) + # export simulation content + self.fluxes.to_csv(os.path.join(self.simulation_path, "fluxes.csv")) + self.conc.to_csv(os.path.join(self.simulation_path, "concentrations.csv")) + obj_vals_df = pandas.DataFrame( + [ + (self.fluxes.columns[index].replace(" min", ""), sol.objective_value) + for index, sol in enumerate(self.sols) + ], + columns=["min", "objective_value"], + ) + obj_vals_df.index = obj_vals_df["min"] + obj_vals_df.drop(["min"], axis=1, inplace=True) + obj_vals_df.to_csv(os.path.join(self.simulation_path, "objective_values.csv")) + # export the parameters + parameters_table = pandas.DataFrame( + self.parameters, columns=["parameter", "value"] + ) + parameters_table.to_csv(os.path.join(self.simulation_path, "parameters.csv")) + # export the figure + self.figure.savefig( + os.path.join(self.simulation_path, "changed_concentrations.svg") + ) + if self.verbose and not self.jupyter: + self.figure.show() diff --git a/modelseedpy/community/mssteadycom.py b/modelseedpy/community/mssteadycom.py new file mode 
100644 index 00000000..db851e34 --- /dev/null +++ b/modelseedpy/community/mssteadycom.py @@ -0,0 +1,438 @@
+from modelseedpy import FBAHelper
+from modelseedpy.core.exceptions import (
+    ObjectAlreadyDefinedError,
+    ParameterError,
+    NoFluxError,
+)
+
+# from modelseedpy.community.commhelper import build_from_species_models, CommHelper
+from optlang import Constraint, Variable
+from itertools import combinations
+from optlang.symbolics import Zero
+from pandas import DataFrame, concat
+from matplotlib import pyplot
+from numpy import array
+import networkx
+import sigfig
+import os, re
+
+
+def add_collection_item(
+    met_name,
+    normalized_flux,
+    flux_threshold,
+    ignore_mets,
+    species_collection,
+    first,
+    second,
+):
+    if flux_threshold and normalized_flux <= flux_threshold:
+        return species_collection
+    if not any([re.search(x, met_name, flags=re.IGNORECASE) for x in ignore_mets]):
+        species_collection[first][second].append(re.sub(r"(_\w\d$)", "", met_name))
+    return species_collection
+
+
+class MSSteadyCom:
+
+    @staticmethod
+    def run_fba(
+        mscommodel,
+        media,
+        pfba=False,
+        fva_reactions=None,
+        ava=False,
+        minMemGrowth: float = 1,
+        interactions=True,
+    ):
+
+        # minGrowth = Constraint(name="minMemGrowth", lb=, ub=None)
+        # mscommodel.model.add_cons_vars
+
+        # fix member abundances
+        if not mscommodel.abundances_set:
+            for member in mscommodel.members:
+                member.biomass_cpd.lb = minMemGrowth
+            all_metabolites = {mscommodel.primary_biomass.products[0]: 1}
+            all_metabolites.update(
+                {
+                    mem.biomass_cpd: 1 / len(mscommodel.members)
+                    for mem in mscommodel.members
+                }
+            )
+            mscommodel.primary_biomass.add_metabolites(all_metabolites, combine=False)
+        # TODO constrain fluxes to be proportional to the relative abundance
+
+        # TODO constrain the sum of fluxes to be proportional with the abundance
+        sol = mscommodel.run_fba(media, pfba, fva_reactions)
+        if interactions:
+            return MSSteadyCom.interactions(mscommodel, sol)
+        if ava:
+            return MSSteadyCom.abundance_variability_analysis(mscommodel, media)
+        return sol
+
+    @staticmethod
+    def abundance_variability_analysis(mscommodel, media):
+        variability = {}
+        for mem in mscommodel.members:
+            variability[mem.id] = {}
+            # minimal variability
+            mscommodel.set_objective(mem.biomasses, minimize=True)
+            variability[mem.id]["minVar"] = mscommodel.run_fba(media)
+            # maximal variability
+            mscommodel.set_objective(mem.biomasses, minimize=False)
+            variability[mem.id]["maxVar"] = mscommodel.run_fba(media)
+        return variability
+
+    @staticmethod
+    def interactions(
+        mscommodel,  # The MSCommunity object of the model (mandatory to prevent circular imports)
+        solution=None,  # the COBRA simulation solution that will be parsed and visualized
+        media=None,  # The media in which the community model will be simulated
+        # names=None, abundances=None,  # names and abundances of the community species
+        flux_threshold: int = 1,  # The threshold of normalized flux below which a reaction is not plotted
+        msdb=None,
+        msdb_path: str = None,
+        visualize: bool = True,  # specifies whether the net flux will be depicted in a network diagram
+        filename: str = "cross_feeding",  # Cross-feeding figure export name
+        export_format: str = "svg",
+        node_metabolites: bool = True,  # specifies whether the metabolites of each node will be printed
+        show_figure: bool = True,  # specifies whether the figure will be printed to the console
+        ignore_mets=None,  # cross-fed exchanges that will not be displayed in the graphs
+    ):
+        # verify that the model has a solution and parallelize where the solver is
permissible + solver = str(type(mscommodel.util.model.solver)) + print(f"{solver} model loaded") + if "gurobi" in solver: + mscommodel.util.model.problem.Params.Threads = os.cpu_count() / 2 + solution = solution or mscommodel.run_fba(media) + if not solution: + raise ParameterError( + "A solution must be provided, from which interactions are computed." + ) + if all(array(list(solution.fluxes.values)) == 0): + print(list(solution.fluxes.values)) + raise NoFluxError("The simulation lacks any flux.") + + # Initialize data + metabolite_data, species_data, species_collection = ( + {}, + {"Environment": {}}, + {"Environment": {}}, + ) + data = {"IDs": [], "Metabolites/Donor": [], "Environment": []} + species_list = {} + + # track extracellularly exchanged metabolites + exchange_mets_list = mscommodel.util.exchange_mets_list() + for met in exchange_mets_list: + data["IDs"].append(met.id) + data["Metabolites/Donor"].append(re.sub(r"(_\w\d$)", "", met.name)) + metabolite_data[met.id] = {"Environment": 0} + metabolite_data[met.id].update( + {individual.id: 0 for individual in mscommodel.members} + ) + + # computing net metabolite flux from each reaction + # print([mem.id for mem in mscommodel.members]) + for individual in mscommodel.members: + species_data[individual.id], species_collection[individual.id] = {}, {} + species_list[individual.index] = individual + data[individual.id] = [] + for other in mscommodel.members: + species_data[individual.id][other.id] = 0 + species_collection[individual.id][other.id] = [] + species_data["Environment"][individual.id] = species_data[individual.id][ + "Environment" + ] = 0 + species_collection["Environment"][individual.id] = [] + species_collection[individual.id]["Environment"] = [] + + for rxn in mscommodel.util.model.reactions: + if rxn.id[0:3] == "EX_": + cpd = list(rxn.metabolites.keys())[0] + # the Environment takes the opposite perspective to the members + metabolite_data[cpd.id]["Environment"] += -solution.fluxes[rxn.id] + rxn_index = int(FBAHelper.rxn_compartment(rxn)[1:]) + if ( + not any([met not in exchange_mets_list for met in rxn.metabolites]) + or rxn_index not in species_list + ): + continue + for met in rxn.metabolites: + if met.id not in metabolite_data: + continue + metabolite_data[met.id][species_list[rxn_index].id] += ( + solution.fluxes[rxn.id] * rxn.metabolites[met] + ) + + # translating net metabolite flux into species interaction flux + ignore_mets = ignore_mets if ignore_mets is not None else ["h2o_e0", "co2_e0"] + for met in exchange_mets_list: + # Iterating through the metabolite producers + # TODO Why are fluxes normalized? 
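+            # The normalization below weights each donor/consumer pair by the product
+            # of their fluxes, scaled by the total positive production of the metabolite:
+            #   total = sum(max(flux[member], 0)) + max(flux["Environment"], 0)
+            #   share = |flux[donor] * flux[consumer]| / total
+            # e.g. donor = +2, consumer = -1, environment = +3  =>  share = |2 * -1| / 5 = 0.4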
+ total = sum( + [ + max([metabolite_data[met.id][individual.id], 0]) + for individual in mscommodel.members + ] + ) + max([metabolite_data[met.id]["Environment"], 0]) + for individual in mscommodel.members: + ## calculate metabolic consumption of a species from the environment + if metabolite_data[met.id][individual.id] < Zero: + if metabolite_data[met.id]["Environment"] <= Zero: + continue + normalized_flux = ( + abs( + metabolite_data[met.id][individual.id] + * metabolite_data[met.id]["Environment"] + ) + / total + ) + species_data["Environment"][individual.id] += normalized_flux + species_collection = add_collection_item( + met.name, + normalized_flux, + flux_threshold, + ignore_mets, + species_collection, + "Environment", + individual.id, + ) + ## calculate and track metabolic donations between a member and another or the environment + elif metabolite_data[met.id][individual.id] > Zero: + for other in mscommodel.members: + ### filter against organisms that do not consume + if metabolite_data[met.id][other.id] >= Zero: + continue + normalized_flux = ( + abs( + metabolite_data[met.id][individual.id] + * metabolite_data[met.id][other.id] + ) + / total + ) + species_data[individual.id][other.id] += normalized_flux + species_collection = add_collection_item( + met.name, + normalized_flux, + flux_threshold, + ignore_mets, + species_collection, + individual.id, + other.id, + ) + ## calculate donations to the environment + if metabolite_data[met.id]["Environment"] >= Zero: + continue + normalized_flux = ( + abs( + metabolite_data[met.id][individual.id] + * metabolite_data[met.id]["Environment"] + ) + / total + ) + species_data[individual.id]["Environment"] += normalized_flux + species_collection = add_collection_item( + met.name, + normalized_flux, + flux_threshold, + ignore_mets, + species_collection, + individual.id, + "Environment", + ) + + # construct the dataframes + for metID in metabolite_data: + for individual in mscommodel.members: + data[individual.id].append(metabolite_data[metID][individual.id]) + data["Environment"].append(metabolite_data[metID]["Environment"]) + + ## process the fluxes dataframe + data["IDs"].append("zz_Environment") + data["Metabolites/Donor"].append(0) + for individual in mscommodel.members: + data[individual.id].append(species_data["Environment"][individual.id]) + data["Environment"].append(0) + for individual in mscommodel.members: + for other in mscommodel.members: + data[individual.id].append(species_data[individual.id][other.id]) + data["Environment"].append(species_data[individual.id]["Environment"]) + data["IDs"].append(f"zz_Species{individual.index}") + data["Metabolites/Donor"].append(individual.id) + + # if len(set(list(map(len, list(data.values()))))) != 1: + # print([(col, len(content)) for col, content in data.items()]) + cross_feeding_df = DataFrame(data) + cross_feeding_df.index = [ + ID.replace("_e0", "") for ID in map(str, cross_feeding_df["IDs"]) + ] + cross_feeding_df.index.name = "Metabolite/Donor ID" + cross_feeding_df.drop(["IDs", "Metabolites/Donor"], axis=1, inplace=True) + cross_feeding_df = cross_feeding_df.loc[(cross_feeding_df != 0).any(axis=1)] + cross_feeding_df.sort_index(inplace=True) + + ## process the identities dataframe + exchanged_mets = {"Environment": [" "], "Donor ID": ["Environment"]} + exchanged_mets.update({ind.id: [] for ind in mscommodel.members}) + for individual in mscommodel.members: + ### environment exchanges + exchanged_mets[individual.id].append( + "; ".join(species_collection["Environment"][individual.id]) + 
) + exchanged_mets["Environment"].append( + "; ".join(species_collection[individual.id]["Environment"]) + ) + ### member exchanges + exchanged_mets["Donor ID"].append(individual.id) + for other in mscommodel.members: + exchanged_mets[individual.id].append( + "; ".join(species_collection[individual.id][other.id]) + ) + + # if len(set(list(map(len, list(exchanged_mets.values()))))) != 1: + # print([(col, len(content)) for col, content in exchanged_mets.items()]) + exMets_df = DataFrame(exchanged_mets) + exMets_df.index = [ + ID.replace("_e0", "") for ID in map(str, exMets_df["Donor ID"]) + ] + exMets_df.index.name = "Donor ID" + exMets_df.drop(["Donor ID"], axis=1, inplace=True) + exMets_df.sort_index(inplace=True) + exMets_df.fillna(" ") + + # graph the network diagram + if visualize: + MSSteadyCom.visual_interactions( + cross_feeding_df, + filename, + export_format, + msdb, + msdb_path, + show_figure, + node_metabolites, + ) + + return cross_feeding_df, exMets_df + + @staticmethod + def visual_interactions( + cross_feeding_df, + filename="cross_feeding", + export_format="svg", + msdb=None, + msdb_path=None, + view_figure=True, + node_metabolites=True, + ): + # load the MSDB + assert msdb or msdb_path, ValueError( + "Either the MSDB object or the local MSDB path must be provided" + ) + from modelseedpy.biochem import from_local + + msdb = msdb or from_local(msdb_path) + # construct the structure of the cross-feeding DataFrame + if "Metabolite/Donor ID" in cross_feeding_df.columns: + cross_feeding_df.index = [ + metID.replace("_e0", "") + for metID in cross_feeding_df["Metabolite/Donor ID"].values + ] + cross_feeding_df.index.name = "Metabolite/Donor ID" + cross_feeding_df.drop( + [col for col in cross_feeding_df.columns if "ID" in col], + axis=1, + inplace=True, + ) + else: + cross_feeding_df.index = [ + metID.replace("_e0", "") for metID in cross_feeding_df.index + ] + # define the cross-fed metabolites + cross_feeding_rows = [] + for index, row in cross_feeding_df.iterrows(): + positive = negative = False + for col, val in row.items(): + if col not in ["Environment"]: + if val > 1e-4: + positive = True + elif val < -1e-4: + negative = True + if negative and positive: + cross_feeding_rows.append(row) + break + metabolites_df = concat(cross_feeding_rows, axis=1).T + metabolites_df.index.name = "Metabolite ID" + display(metabolites_df) + metabolites = [ + msdb.compounds.get_by_id(metID.replace("_e0", "")) + for metID in metabolites_df.index.tolist() + if metID not in ["cpdETCM", "cpdETCMe"] + ] + # define the community members that participate in cross-feeding + members = metabolites_df.loc[ + :, (metabolites_df != 0).any(axis=0) + ].columns.tolist() + members.remove("Environment") + members_cluster1, members_cluster2 = ( + members[: int(len(members) / 2)], + members[int(len(members) / 2) :], + ) + + # TODO define a third node tier of just the environment as a rectangle that spans the width of the members + ## which may alleviate much of the ambiguity about mass imbalance between the member fluxes + import graphviz + + dot = graphviz.Digraph(filename, format=export_format) # directed graph + # define nodes + ## top-layer members + # TODO hyperlink the member nodes with their Narrative link + dot.attr("node", shape="rectangle", color="lightblue2", style="filled") + for mem in members_cluster1: + index = members.index(mem) + dot.node(f"S{index}", mem) + ## mets in the middle layer + with dot.subgraph(name="mets") as mets_subgraph: + mets_subgraph.attr(rank="same") + mets_subgraph.attr("node", 
shape="circle", color="green", style="filled") + for metIndex, met in enumerate(metabolites): + mets_subgraph.node( + met.abbr[:3], + fixedsize="true", + height="0.4", + tooltip=f"{met.id} ; {met.name}", + URL=f"https://modelseed.org/biochem/compounds/{met.id}", + ) + ## bottom-layer members + with dot.subgraph(name="members") as members_subgraph: + members_subgraph.attr(rank="same") + for mem in members_cluster2: + index = members.index(mem) + dot.node(f"S{index}", mem) + # define the edges by parsing the interaction DataFrame + for met in metabolites: + row = metabolites_df.loc[met.id] + maxVal = max(list(row.to_numpy())) + for col, val in row.items(): + if col == "Environment": + continue + index = members.index(col) + # TODO color carbon sources red + if val > 0: + dot.edge( + f"S{index}", + met.abbr[:3], + arrowsize=f"{val / maxVal}", + edgetooltip=str(val), + ) + if val < 0: + dot.edge( + met.abbr[:3], + f"S{index}", + arrowsize=f"{abs(val / maxVal)}", + edgetooltip=str(val), + ) + + # render and export the source + dot.render(filename, view=view_figure) + return dot.source diff --git a/modelseedpy/community/steadycom_template.html b/modelseedpy/community/steadycom_template.html new file mode 100644 index 00000000..b894c7f7 --- /dev/null +++ b/modelseedpy/community/steadycom_template.html @@ -0,0 +1,54 @@ + + + + + + SteadyCom Results + + + + + + + + + + +

SteadyCom Results

+ + + + \ No newline at end of file diff --git a/modelseedpy/config.cfg b/modelseedpy/config.cfg index 3aee00c7..e46ec871 100644 --- a/modelseedpy/config.cfg +++ b/modelseedpy/config.cfg @@ -1,3 +1,5 @@ +[biochem] +path = /deps/ModelSEEDDatabase [data] template_folder = data/templates classifier_folder = data/ml diff --git a/modelseedpy/core/__init__.py b/modelseedpy/core/__init__.py index 7e16d262..bd374a03 100644 --- a/modelseedpy/core/__init__.py +++ b/modelseedpy/core/__init__.py @@ -9,6 +9,9 @@ from modelseedpy.core.mseditorapi import MSEditorAPI, MSEquation from modelseedpy.core.msgapfill import MSGapfill from modelseedpy.core.msatpcorrection import MSATPCorrection -from modelseedpy.core.msgrowthphenotypes import MSGrowthPhenotypes +from modelseedpy.core.msgrowthphenotypes import MSGrowthPhenotypes, MSGrowthPhenotype from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.core.mstemplate import MSTemplateBuilder +from modelseedpy.core.msmodelreport import MSModelReport +from modelseedpy.core.annotationontology import AnnotationOntology from modelseedpy.core.exceptions import * diff --git a/modelseedpy/core/annotationontology.py b/modelseedpy/core/annotationontology.py new file mode 100644 index 00000000..3b7c533f --- /dev/null +++ b/modelseedpy/core/annotationontology.py @@ -0,0 +1,493 @@ +# -*- coding: utf-8 -*- +import logging +import re +import time +import json +import sys +import pandas as pd +import cobra +from cobra import DictList +from modelseedpy.core.msgenome import MSGenome + +# from builtins import None + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + +# Class structure +# AnnotationOntology -> Features/Events/Terms/Ontologies +# AnnotationOntologyOntology -> Events/Terms +# AnnotationOntologyEvent -> Features/Ontology +# AnnotationOntologyFeature -> Term+Event->Evidence +# AnnotationOntologyTerm -> Ontology/Events/Featurs +# AnnotationOntologyEvidence -> -- + +allowable_score_types = [ + "probability", + "evalue", + "bitscore", + "identity", + "qalignstart", + "qalignstop", + "salignstart", + "salignstop", + "kmerhits", + "tmscore", + "rmsd", + "hmmscore", + "score" +] + +def convert_to_search_role(role): + role = role.lower() + role = re.sub("\s","",role) + role = re.sub("[\d\-]+\.[\d\-]+\.[\d\-]+\.[\d\-]*","",role) + role = re.sub("\#.*$","",role) + role = re.sub("\(ec:*\)","",role) + role = re.sub("[\(\)\[\],-]","",role) + return role + +def split_role(role): + return re.split("\s*;\s+|\s+[\@\/]\s+",role) + +class AnnotationOntologyEvidence: + def __init__(self, parent, event, term, probability=1, scores={}, ref_entity=None, entity_type=None): + self.parent = parent + self.event = event + self.term = term + self.probability = probability + self.ref_entity = ref_entity + self.entity_type = entity_type + self.scores = scores + #for item in self.scores: + #if item not in allowable_score_types: + #logger.warning(item + " not an allowable score type!") + + def to_data(self): + output = { + "event":self.event.method, + "term":self.term.id, + "ontology":self.term.ontology.id, + "probability":self.probability + } + if self.ref_entity: + output["ref_entity"] = self.ref_entity + if self.entity_type: + output["entity_type"] = self.entity_type + if self.scores: + output["scores"] = self.scores + return output + + +class AnnotationOntologyTerm: + def __init__(self, parent, term_id, ontology): + self.id = term_id + self.parent = parent + self.ontology 
= ontology + self.ontology.add_term(self) + self.parent.add_term(self) + self.msrxns = set() + self.events = {} + self.features = {} + + def add_msrxns(self, rxn_ids): + for rxn_id in rxn_ids: + if rxn_id[0:6] == "MSRXN:": + rxn_id = rxn_id[6:] + self.msrxns.update([rxn_id]) + + def add_event(self, event): + self.events[event.id] = event + + def add_feature(self, feature): + self.features[feature.id] = feature + + +class AnnotationOntologyOntology: + def __init__(self, parent, ontology_id): + self.id = ontology_id + self.parent = parent + self.events = {} + self.terms = {} + + def add_event(self, event): + self.events[event.id] = event + + def add_term(self, term): + self.terms[term.id] = term + + +class AnnotationOntologyFeature: + def __init__(self, parent, feature_id, type=None): + self.id = feature_id + self.parent = parent + parent.add_feature(self) + self.type = type + self.event_terms = {} + self.term_events = {} + + def add_event_term(self, event, term, scores={}, ref_entity=None, entity_type=None,probability=1): + if event.id not in self.event_terms: + self.event_terms[event.id] = {} + self.event_terms[event.id][term.id] = AnnotationOntologyEvidence( + self,event,term,probability=probability,scores=scores,ref_entity=ref_entity,entity_type=entity_type + ) + if term.id not in self.term_events: + self.term_events[term.id] = {} + self.term_events[term.id][event.id] = self.event_terms[event.id][term.id] + + def get_associated_terms( + self, + prioritized_event_list=None, + ontologies=None, + merge_all=False, + translate_to_rast=False, + ): + output = {} + for term_id in self.term_events: + term = self.parent.terms[term_id] + if not ontologies or term.ontology.id in ontologies: + if merge_all or not prioritized_event_list: + for event_id in self.term_events[term_id]: + if ( + not prioritized_event_list + or event_id in prioritized_event_list + ): + if term not in output: + output[term] = [] + output[term].append( + self.term_events[term_id][event_id].to_data() + ) + else: + for event_id in prioritized_event_list: + if event_id in self.term_events[term_id]: + rxns = self.parent.terms[term_id].msrxns + if len(rxns) > 0: + if term not in output: + output[term] = [] + output[term].append( + self.term_events[term_id][event_id].to_data() + ) + break + return output + + def get_associated_reactions( + self, prioritized_event_list=None, ontologies=None, merge_all=False + ): + output = {} + for term_id in self.term_events: + if not ontologies or self.parent.terms[term_id].ontology.id in ontologies: + if merge_all or not prioritized_event_list: + for event_id in self.term_events[term_id]: + if ( + not prioritized_event_list + or event_id in prioritized_event_list + ): + rxns = self.parent.terms[term_id].msrxns + for rxn_id in rxns: + if rxn_id not in output: + output[rxn_id] = [] + output[rxn_id].append( + self.term_events[term_id][event_id].to_data() + ) + else: + for event_id in prioritized_event_list: + if event_id in self.term_events[term_id]: + rxns = self.parent.terms[term_id].msrxns + for rxn_id in rxns: + if rxn_id not in output: + output[rxn_id] = [] + output[rxn_id].append( + self.term_events[term_id][event_id].to_data() + ) + if len(rxns) > 0: + break + return output + + +class AnnotationOntologyEvent: + def __init__( + self, + parent, + event_id, + ontology_id, + method, + method_version=None, + description=None, + timestamp=None, + ): + self.id = event_id + self.parent = parent + # Linking ontology + self.ontology = self.parent.add_ontology(ontology_id) + 
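+        # Each event is wired into the shared object graph sketched at the top of
+        # this module: the parent AnnotationOntology registers ontologies and terms,
+        # the ontology back-references its events (add_event below), and features
+        # later attach AnnotationOntologyEvidence records that pair this event with
+        # a term (see AnnotationOntologyFeature.add_event_term).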
self.ontology.add_event(self)
+        if not description:
+            self.description = ""  # TODO
+        else:
+            self.description = description
+        self.method = method
+        self.method_version = method_version
+        self.timestamp = timestamp
+        self.features = {}
+
+    @staticmethod
+    def from_data(data, parent):
+        if "method_version" not in data:
+            data["method_version"] = None
+        if "description" not in data:
+            data["description"] = None
+        if "timestamp" not in data:
+            data["timestamp"] = None
+        self = AnnotationOntologyEvent(
+            parent,
+            data["event_id"],
+            data["ontology_id"],
+            data["method"],
+            data["method_version"],
+            data["description"],
+            data["timestamp"],
+        )
+        if "ontology_terms" in data:
+            for feature_id in data["ontology_terms"]:
+                feature = self.parent.add_feature(feature_id)
+                self.add_feature(feature)
+                for item in data["ontology_terms"][feature_id]:
+                    term = self.parent.add_term(item["term"], self.ontology)
+                    scores = {}
+                    ref_entity = None
+                    entity_type = None
+                    if "evidence" in item:
+                        if "scores" in item["evidence"]:
+                            scores = item["evidence"]["scores"]
+                        if "reference" in item["evidence"]:
+                            ref_entity = item["evidence"]["reference"][1]
+                            entity_type = item["evidence"]["reference"][0]
+                    # Split probability evenly across this feature's terms
+                    probability = 1 / len(data["ontology_terms"][feature_id])
+                    feature.add_event_term(
+                        self, term, scores, ref_entity, entity_type, probability
+                    )
+                    if "modelseed_ids" in item:
+                        term.add_msrxns(item["modelseed_ids"])
+        return self
+
+    def add_feature(self, feature):
+        self.features[feature.id] = feature
+
+    def to_data(self):
+        data = {
+            "event_id": self.id,
+            "description": self.description,
+            "ontology_id": self.ontology.id,
+            "method": self.method,
+            "method_version": self.method_version,
+            "timestamp": self.timestamp,
+            "ontology_terms": {},
+        }
+        for feature in self.features:
+            data["ontology_terms"][feature] = {"term": None}  # TODO
+        return data
+
+
+class AnnotationOntology:
+    mdlutls = {}
+
+    @staticmethod
+    def from_kbase_data(data, genome_ref=None, data_dir=None):
+        self = AnnotationOntology(genome_ref, data_dir)
+        if "feature_types" in data:
+            self.feature_types = data["feature_types"]
+        if "events" in data:
+            for event in data["events"]:
+                self.events += [AnnotationOntologyEvent.from_data(event, self)]
+        return self
+
+    def __init__(self, genome_ref, data_dir):
+        self.genome_ref = genome_ref
+        self.events = DictList()
+        self.terms = {}
+        self.ontologies = {}
+        self.genes = {}
+        self.cdss = {}
+        self.data_dir = data_dir
+        self.noncodings = {}
+        self.feature_types = {}
+        self.term_names = {}
+        self.info = None
+
+    def get_term_name(self, term):
+        if term.ontology.id not in self.term_names:
+            self.term_names[term.ontology.id] = {}
+        if term.ontology.id in [
+            "SSO",
+            "AntiSmash",
+            "EC",
+            "TC",
+            "META",
+            "RO",
+            "KO",
+            "GO",
+        ]:
+            with open(
+                self.data_dir + "/" + term.ontology.id + "_dictionary.json"
+            ) as json_file:
+                ontology = json.load(json_file)
+                for item in ontology["term_hash"]:
+                    self.term_names[term.ontology.id][item] = ontology["term_hash"][
+                        item
+                    ]["name"]
+        if term.id not in self.term_names[term.ontology.id]:
+            return "Unknown"
+        return self.term_names[term.ontology.id][term.id]
+
+    def get_gene_term_hash(
+        self,
+        prioritized_event_list=None,
+        ontologies=None,
+        merge_all=False,
+        feature_type=None,
+        translate_to_rast=True,
+    ):
+        output = {}
+        feature_hash = self.genes
+        if len(self.genes) == 0 or (feature_type == "cds" and len(self.cdss) > 0):
+            feature_hash = self.cdss
+        for feature_id in feature_hash:
+            if not feature_type or feature_type == self.feature_types[feature_id]:
+                feature =
feature_hash[feature_id] + if feature not in output: + output[feature] = {} + output[feature] = feature.get_associated_terms( + prioritized_event_list, ontologies, merge_all, translate_to_rast + ) + return output + + def get_reaction_gene_hash( + self, + prioritized_event_list=None, + ontologies=None, + merge_all=False, + cds_features=False, + feature_type=None + ): + output = {} + feature_hash = self.genes + if len(self.genes) == 0 or (cds_features and len(self.cdss) == 0): + feature_hash = self.cdss + for feature_id in feature_hash: + if not feature_type or feature_type == self.feature_types[feature_id]: + reactions = feature_hash[feature_id].get_associated_reactions( + prioritized_event_list, ontologies, merge_all + ) + for rxn_id in reactions: + if rxn_id not in output: + output[rxn_id] = {} + if feature_id not in output[rxn_id]: + output[rxn_id][feature_id] = {"probability": 0, "evidence": []} + for item in reactions[rxn_id]: + output[rxn_id][feature_id]["evidence"].append(item) + for rxn_id in output: + total_prob = 0 + for feature_id in output[rxn_id]: + sub_total_prob = 0 + for evidence in output[rxn_id][feature_id]["evidence"]: + sub_total_prob += evidence["probability"] + output[rxn_id][feature_id]["probability"] = sub_total_prob + total_prob += sub_total_prob + for feature_id in output[rxn_id]: + output[rxn_id][feature_id]["probability"] = ( + output[rxn_id][feature_id]["probability"] / total_prob + ) + return output + + def add_term(self, term_or_id, ontology=None): + if not isinstance(term_or_id, AnnotationOntologyTerm): + if term_or_id in self.terms: + return self.terms[term_or_id] + else: + return AnnotationOntologyTerm(self, term_or_id, ontology) + if term_or_id.id in self.terms: + logger.critical("Term with id " + term_or_id.id + " already in annotation!") + return self.terms[term_or_id.id] + else: + self.terms[term_or_id.id] = term_or_id + + def add_ontology(self, ontology_or_id): + if not isinstance(ontology_or_id, AnnotationOntologyOntology): + if ontology_or_id in self.ontologies: + return self.ontologies[ontology_or_id] + else: + return AnnotationOntologyOntology(self, ontology_or_id) + if ontology_or_id.id in self.ontologies: + logger.critical( + "Ontology with id " + ontology_or_id.id + " already in annotation!" 
+ ) + return self.ontologies[ontology_or_id.id] + else: + self.ontologies[ontology_or_id.id] = ontology_or_id + + def get_feature_hash(self, feature_id): + feature_hash = self.genes + if feature_id in self.feature_types: + if self.feature_types[feature_id] == "cds": + feature_hash = self.cdss + elif self.feature_types[feature_id] == "noncoding": + feature_hash = self.noncodings + return feature_hash + + def add_feature(self, feature_or_id): + feature_hash = None + if not isinstance(feature_or_id, AnnotationOntologyFeature): + feature_hash = self.get_feature_hash(feature_or_id) + if feature_or_id in feature_hash: + return feature_hash[feature_or_id] + else: + feature_or_id = AnnotationOntologyFeature(self, feature_or_id) + if not feature_hash: + feature_hash = self.get_feature_hash(feature_or_id.id) + if feature_or_id.id not in feature_hash: + feature_hash[feature_or_id.id] = feature_or_id + return feature_hash[feature_or_id.id] + + def get_msgenome(self,prioritized_event_list=None,ontologies=None,merge_all=False,feature_type=None,translate_to_rast=True): + newgenome = MSGenome.from_annotation_ontology( + self, prioritized_event_list, ontologies, merge_all,feature_type, translate_to_rast + ) + newgenome.annoont = self + return newgenome + + def get_events_from_priority_list(self,priority_list): + event_list = [] + for item in priority_list: + selected_merge = None + for event in self.events: + if item == "all": + if event.id not in event_list: + event_list.append(event.id) + elif item == "RAST": + if len(event.method) > 4 and event.method[0:4] == "RAST" and event.id not in event_list: + event_list.append(event.id) + elif item == "Prokka": + if len(event.method) > 6 and event.method[0:6] == "Prokka" and event.id not in event_list: + event_list.append(event.id) + elif item == "DRAM": + if len(event.method) > 4 and event.method[0:4] == "DRAM" and event.id not in event_list: + event_list.append(event.id) + elif item == "GLM4EC": + if len(event.method) > 6 and event.method[0:6] == "GLM4EC" and event.id not in event_list: + event_list.append(event.id) + elif item == "PDB": + if event.method == "KBAnnotationApps.PDBAnnotation" and event.ontology.id == "EC" and event.id not in event_list: + event_list.append(event.id) + elif item == "SNEKMER": + if len(event.method) > 7 and event.method[0:7] == "Snekmer" and event.id not in event_list: + event_list.append(event.id) + elif item == "Import": + if len(event.method) > 6 and event.method[0:6] == "Import" and event.id not in event_list: + event_list.append(event.id) + elif item == "Merge": + if len(event.method) > 5 and event.method[0:5] == "Merge" and event.id not in event_list: + selected_merge = event.id + elif item.lower() in event.description.lower() or item.lower() in event.id.lower(): + event_list.append(event.id) + if selected_merge: + event_list.append(selected_merge) + return event_list \ No newline at end of file diff --git a/modelseedpy/core/exceptions.py b/modelseedpy/core/exceptions.py index ce708956..e3e01211 100644 --- a/modelseedpy/core/exceptions.py +++ b/modelseedpy/core/exceptions.py @@ -1,6 +1,12 @@ # -*- coding: utf-8 -*- # Adding a few exception classes to handle different types of errors in a central file +class ModelSEEDError(Exception): + """Error in ModelSEED execution logic""" + + pass + + class FeasibilityError(Exception): """Error in FBA formulation""" @@ -18,3 +24,37 @@ class GapfillingError(Exception): """Error in model gapfilling""" pass + + +class ObjectError(Exception): + """Error in the construction of a base KBase 
object""" + + pass + + +class ParameterError(Exception): + """Error in a parameterization""" + + pass + + +class ObjectAlreadyDefinedError(Exception): + pass + + +class NoFluxError(Exception): + """Error for FBA solutions""" + + pass + + +class ObjectiveError(Exception): + """Erroneous assignment of a secondary objective via a constraint""" + + pass + + +class ModelError(Exception): + """Errors in a model that corrupt the simulation""" + + pass \ No newline at end of file diff --git a/modelseedpy/core/fbabuilder.py b/modelseedpy/core/fbabuilder.py new file mode 100644 index 00000000..b82e161d --- /dev/null +++ b/modelseedpy/core/fbabuilder.py @@ -0,0 +1,1546 @@ +import logging + +import re +import copy +from optlang.symbolics import Zero, add +from cobra.core import Gene, Metabolite, Model, Reaction +from cobrakbase.core.kbaseobject import AttrDict +from cobrakbase.annotation_ontology_api.annotation_ontology_apiServiceClient import ( + annotation_ontology_api, +) +import modelseedpy.core.fbahelper + +logger = logging.getLogger(__name__) + + +def build_cpd_id(str): + if str.startswith("M_"): + str = str[2:] + elif str.startswith("M-"): + str = str[2:] + str_fix = str + if "-" in str_fix: + str_fix = str_fix.replace("-", "__DASH__") + if not str == str_fix: + logger.debug("[Species] rename: [%s] -> [%s]", str, str_fix) + return str + + +def build_rxn_id(str): + if str.startswith("R_"): + str = str[2:] + elif str.startswith("R-"): + str = str[2:] + str_fix = str + if "-" in str_fix: + str_fix = str_fix.replace("-", "__DASH__") + if not str == str_fix: + logger.debug("[Reaction] rename: [%s] -> [%s]", str, str_fix) + return str_fix + + +# Adding a few exception classes to handle different types of errors +class ObjectError(Exception): + """Error in the construction of a base KBase object""" + + pass + + +class FeasibilityError(Exception): + """Error in FBA formulation""" + + pass + + +# New class to store functions to building and tracking new constraints and variables related to our own custom FBA formulations +class KBaseFBAUtilities: + def __init__( + self, + cobramodel, + fbamodel, + kbapi, + media=None, + default_uptake=100, + default_excretion=100, + blacklist=[], + auto_sink=["cpd02701_c", "cpd11416_c0", "cpd15302_c"], + ): + self.cobramodel = cobramodel + self.SBO_ANNOTATION = "sbo" + self.metabolites_remap = {} + self.solution_exclusion_constraints = [] + self.kbapi = kbapi + self.potential_variables = dict() + self.reversibility_binary = dict() + self.reversibility_binary_constraints = dict() + self.binary_flux_variables = dict() + self.total_flux_variables = dict() + self.total_flux_constraints = dict() + self.binary_flux_constraints = dict() + self.simple_thermo_constraints = dict() + self.metabolomics_peak_variables = dict() + self.metabolomics_peak_constraints = dict() + self.compound_flux_variables = dict() + self.compound_flux_constraints = dict() + self.metabolomics_constraints = dict() + self.media = None + self.default_uptake = default_uptake + self.default_excretion = default_excretion + self.apply_media_to_model(media, self.default_uptake, self.default_excretion) + self.blacklist = [ + "rxn12985", + "rxn00238", + "rxn07058", + "rxn05305", + "rxn00154", + "rxn09037", + "rxn10643", + "rxn11317", + "rxn05254", + "rxn05257", + "rxn05258", + "rxn05259", + "rxn05264", + "rxn05268", + "rxn05269", + "rxn05270", + "rxn05271", + "rxn05272", + "rxn05273", + "rxn05274", + "rxn05275", + "rxn05276", + "rxn05277", + "rxn05278", + "rxn05279", + "rxn05280", + "rxn05281", + "rxn05282", + 
"rxn05283", + "rxn05284", + "rxn05285", + "rxn05286", + "rxn05963", + "rxn05964", + "rxn05971", + "rxn05989", + "rxn05990", + "rxn06041", + "rxn06042", + "rxn06043", + "rxn06044", + "rxn06045", + "rxn06046", + "rxn06079", + "rxn06080", + "rxn06081", + "rxn06086", + "rxn06087", + "rxn06088", + "rxn06089", + "rxn06090", + "rxn06091", + "rxn06092", + "rxn06138", + "rxn06139", + "rxn06140", + "rxn06141", + "rxn06145", + "rxn06217", + "rxn06218", + "rxn06219", + "rxn06220", + "rxn06221", + "rxn06222", + "rxn06223", + "rxn06235", + "rxn06362", + "rxn06368", + "rxn06378", + "rxn06474", + "rxn06475", + "rxn06502", + "rxn06562", + "rxn06569", + "rxn06604", + "rxn06702", + "rxn06706", + "rxn06715", + "rxn06803", + "rxn06811", + "rxn06812", + "rxn06850", + "rxn06901", + "rxn06971", + "rxn06999", + "rxn07123", + "rxn07172", + "rxn07254", + "rxn07255", + "rxn07269", + "rxn07451", + "rxn09037", + "rxn10018", + "rxn10077", + "rxn10096", + "rxn10097", + "rxn10098", + "rxn10099", + "rxn10101", + "rxn10102", + "rxn10103", + "rxn10104", + "rxn10105", + "rxn10106", + "rxn10107", + "rxn10109", + "rxn10111", + "rxn10403", + "rxn10410", + "rxn10416", + "rxn11313", + "rxn11316", + "rxn11318", + "rxn11353", + "rxn05224", + "rxn05795", + "rxn05796", + "rxn05797", + "rxn05798", + "rxn05799", + "rxn05801", + "rxn05802", + "rxn05803", + "rxn05804", + "rxn05805", + "rxn05806", + "rxn05808", + "rxn05812", + "rxn05815", + "rxn05832", + "rxn05836", + "rxn05851", + "rxn05857", + "rxn05869", + "rxn05870", + "rxn05884", + "rxn05888", + "rxn05896", + "rxn05898", + "rxn05900", + "rxn05903", + "rxn05904", + "rxn05905", + "rxn05911", + "rxn05921", + "rxn05925", + "rxn05936", + "rxn05947", + "rxn05956", + "rxn05959", + "rxn05960", + "rxn05980", + "rxn05991", + "rxn05992", + "rxn05999", + "rxn06001", + "rxn06014", + "rxn06017", + "rxn06021", + "rxn06026", + "rxn06027", + "rxn06034", + "rxn06048", + "rxn06052", + "rxn06053", + "rxn06054", + "rxn06057", + "rxn06059", + "rxn06061", + "rxn06102", + "rxn06103", + "rxn06127", + "rxn06128", + "rxn06129", + "rxn06130", + "rxn06131", + "rxn06132", + "rxn06137", + "rxn06146", + "rxn06161", + "rxn06167", + "rxn06172", + "rxn06174", + "rxn06175", + "rxn06187", + "rxn06189", + "rxn06203", + "rxn06204", + "rxn06246", + "rxn06261", + "rxn06265", + "rxn06266", + "rxn06286", + "rxn06291", + "rxn06294", + "rxn06310", + "rxn06320", + "rxn06327", + "rxn06334", + "rxn06337", + "rxn06339", + "rxn06342", + "rxn06343", + "rxn06350", + "rxn06352", + "rxn06358", + "rxn06361", + "rxn06369", + "rxn06380", + "rxn06395", + "rxn06415", + "rxn06419", + "rxn06420", + "rxn06421", + "rxn06423", + "rxn06450", + "rxn06457", + "rxn06463", + "rxn06464", + "rxn06466", + "rxn06471", + "rxn06482", + "rxn06483", + "rxn06486", + "rxn06492", + "rxn06497", + "rxn06498", + "rxn06501", + "rxn06505", + "rxn06506", + "rxn06521", + "rxn06534", + "rxn06580", + "rxn06585", + "rxn06593", + "rxn06609", + "rxn06613", + "rxn06654", + "rxn06667", + "rxn06676", + "rxn06693", + "rxn06730", + "rxn06746", + "rxn06762", + "rxn06779", + "rxn06790", + "rxn06791", + "rxn06792", + "rxn06793", + "rxn06794", + "rxn06795", + "rxn06796", + "rxn06797", + "rxn06821", + "rxn06826", + "rxn06827", + "rxn06829", + "rxn06839", + "rxn06841", + "rxn06842", + "rxn06851", + "rxn06866", + "rxn06867", + "rxn06873", + "rxn06885", + "rxn06891", + "rxn06892", + "rxn06896", + "rxn06938", + "rxn06939", + "rxn06944", + "rxn06951", + "rxn06952", + "rxn06955", + "rxn06957", + "rxn06960", + "rxn06964", + "rxn06965", + "rxn07086", + "rxn07097", + "rxn07103", + "rxn07104", 
+ "rxn07105", + "rxn07106", + "rxn07107", + "rxn07109", + "rxn07119", + "rxn07179", + "rxn07186", + "rxn07187", + "rxn07188", + "rxn07195", + "rxn07196", + "rxn07197", + "rxn07198", + "rxn07201", + "rxn07205", + "rxn07206", + "rxn07210", + "rxn07244", + "rxn07245", + "rxn07253", + "rxn07275", + "rxn07299", + "rxn07302", + "rxn07651", + "rxn07723", + "rxn07736", + "rxn07878", + "rxn11417", + "rxn11582", + "rxn11593", + "rxn11597", + "rxn11615", + "rxn11617", + "rxn11619", + "rxn11620", + "rxn11624", + "rxn11626", + "rxn11638", + "rxn11648", + "rxn11651", + "rxn11665", + "rxn11666", + "rxn11667", + "rxn11698", + "rxn11983", + "rxn11986", + "rxn11994", + "rxn12006", + "rxn12007", + "rxn12014", + "rxn12017", + "rxn12022", + "rxn12160", + "rxn12161", + "rxn01267", + "rxn05294", + "rxn04656", + ] + for item in blacklist: + if item not in self.blacklist: + self.blacklist.append(item) + self.auto_sink = [] + full_id = re.compile("\d+$") + for id in auto_sink: + if full_id.search(id): + self.auto_sink.append(id) + else: + for i in range(0, 100): + newid = id + str(i) + self.auto_sink.append(newid) + + self.auto_exchange = "e0" + self.sink_compounds = set() + self.demand_compounds = set() + self.exchange_compounds = set() + self.COBRA_0_BOUND = 0 + self.COBRA_DEFAULT_LB = -1000 + self.COBRA_DEFAULT_UB = 1000 + + def media_const_hash(self): + bound_hash = dict() + if not self.media == None: + for compound in self.media.mediacompounds: + bound_hash[compound.id] = { + "lb": -1 * compound.maxFlux, + "ub": -1 * compound.minFlux, + } + return bound_hash + + def apply_media_to_model( + self, media=None, default_uptake=None, default_excretion=None + ): + self.media = media + if default_uptake == None: + default_uptake = self.default_uptake + if default_excretion == None: + default_excretion = self.default_excretion + + bound_hash = self.media_const_hash() + for reaction in self.cobramodel.reactions: + if reaction.id[0:3].lower() == "ex_": + compound = reaction.id[3:] + if compound[-3:] == "_e0": + compound = compound[:-3] + if compound in bound_hash: + reaction.lower_bound = bound_hash[compound]["lb"] + reaction.upper_bound = bound_hash[compound]["ub"] + else: + reaction.lower_bound = -1 * default_uptake + reaction.upper_bound = default_excretion + reaction.update_variable_bounds() + + def add_total_flux_constraints(self, reaction_filter=None): + for reaction in self.cobramodel.reactions: + if reaction_filter == None or reaction.id in reaction_filter: + self.total_flux_variables[ + reaction.id + ] = self.cobramodel.problem.Variable( + reaction.id + "_tot", lb=0, ub=self.COBRA_DEFAULT_UB + ) + self.cobramodel.add_cons_vars(self.total_flux_variables[reaction.id]) + self.total_flux_constraints[ + reaction.id + ] = self.cobramodel.problem.Constraint( + reaction.forward_variable + + reaction.reverse_variable + - self.total_flux_variables[reaction.id], + lb=0, + ub=0, + name=reaction.id + "_tot", + ) + self.cobramodel.add_cons_vars(self.total_flux_constraints[reaction.id]) + + def add_reversibility_binary_constraints(self, reaction_filter=None): + # Adding thermodynamic constraints + for reaction in self.cobramodel.reactions: + if reaction.id not in self.reversibility_binary and ( + reaction_filter == None or reaction.id in reaction_filter + ): + self.reversibility_binary[ + reaction.id + ] = self.cobramodel.problem.Variable( + reaction.id + "_rb", lb=0, ub=1, type="binary" + ) + self.cobramodel.add_cons_vars(self.reversibility_binary[reaction.id]) + self.reversibility_binary_constraints[reaction.id] = dict() + 
self.reversibility_binary_constraints[reaction.id][ + "ff" + ] = self.cobramodel.problem.Constraint( + 1000 * self.reversibility_binary[reaction.id] + - reaction.forward_variable, + lb=0, + ub=None, + name=reaction.id + "_FB", + ) + self.cobramodel.add_cons_vars( + self.reversibility_binary_constraints[reaction.id]["ff"] + ) + self.reversibility_binary_constraints[reaction.id][ + "rf" + ] = self.cobramodel.problem.Constraint( + -1000 * self.reversibility_binary[reaction.id] + - reaction.reverse_variable, + lb=-1000, + ub=None, + name=reaction.id + "_RB", + ) + self.cobramodel.add_cons_vars( + self.reversibility_binary_constraints[reaction.id]["rf"] + ) + + def set_objective_from_target_reaction(self, target_reaction, maximize=1): + target_reaction = self.cobramodel.reactions.get_by_id(target_reaction) + sense = "max" + if maximize == 0: + sense = "min" + target_objective = self.cobramodel.problem.Objective( + 1 * target_reaction.flux_expression, direction=sense + ) + self.cobramodel.objective = target_objective + return target_reaction + + def add_simple_thermo_constraints(self): + # Creating potential variables for all compounds + for metabolite in self.cobramodel.metabolites: + if metabolite.id not in self.potential_variables: + self.potential_variables[ + metabolite.id + ] = self.cobramodel.problem.Variable( + metabolite.id + "_u", lb=0, ub=1000 + ) + self.cobramodel.add_cons_vars(self.potential_variables[metabolite.id]) + # Adding thermodynamic constraints + for reaction in self.cobramodel.reactions: + if ( + reaction.id not in self.simple_thermo_constraints + and reaction.id[0:3].lower() != "ex_" + and reaction.id[0:3].lower() != "dm_" + ): + if reaction.id not in self.reversibility_binary: + self.reversibility_binary[ + reaction.id + ] = self.cobramodel.problem.Variable( + reaction.id + "_rb", lb=0, ub=1, type="binary" + ) + self.cobramodel.add_cons_vars( + self.reversibility_binary[reaction.id] + ) + self.reversibility_binary_constraints[reaction.id] = dict() + self.reversibility_binary_constraints[reaction.id][ + "ff" + ] = self.cobramodel.problem.Constraint( + 1000 * self.reversibility_binary[reaction.id] + - reaction.forward_variable, + lb=0, + ub=None, + name=reaction.id + "_FB", + ) + self.cobramodel.add_cons_vars( + self.reversibility_binary_constraints[reaction.id]["ff"] + ) + self.reversibility_binary_constraints[reaction.id][ + "rf" + ] = self.cobramodel.problem.Constraint( + -1000 * self.reversibility_binary[reaction.id] + - reaction.reverse_variable, + lb=-1000, + ub=None, + name=reaction.id + "_RB", + ) + self.cobramodel.add_cons_vars( + self.reversibility_binary_constraints[reaction.id]["rf"] + ) + self.simple_thermo_constraints[ + reaction.id + ] = self.cobramodel.problem.Constraint( + Zero, lb=0, ub=1000, name=reaction.id + "_therm" + ) + self.cobramodel.add_cons_vars( + self.simple_thermo_constraints[reaction.id] + ) + self.cobramodel.solver.update() + const_coef = {self.reversibility_binary[reaction.id]: 1000} + for metabolite in reaction.metabolites: + const_coef[ + self.potential_variables[metabolite.id] + ] = reaction.metabolites[metabolite] + self.simple_thermo_constraints[reaction.id].set_linear_coefficients( + const_coef + ) + # Updating solver one final time + self.cobramodel.solver.update() + + def add_intracellular_metabolomics_constraints( + self, peakstring, relevant_peaks=None + ): + drain_fluxes = list() + peak_array = peakstring.split(";") + compound_reactions = dict() + reaction_hash = dict() + for reaction in self.cobramodel.reactions: + 
reaction_hash[reaction.id] = 1 + for compound in reaction.metabolites: + if compound.id not in compound_reactions: + compound_reactions[compound.id] = dict() + compound_reactions[compound.id][reaction.id] = reaction.metabolites[ + compound + ] + compartment_tag = re.compile("_[a-z]\d+$") + for peak in peak_array: + sub_array = peak.split(":") + if len(sub_array) > 2: + peakid = sub_array[0] + if relevant_peaks == None or peakid in relevant_peaks: + coef = sub_array[1] + peak_coef = dict() + pfound = 0 + for i in range(2, len(sub_array)): + compound_list = [] + compound = sub_array[i] + if compartment_tag.search(compound): + compound_list = [compound] + else: + for i in range(0, 1000): + compound_list.append(compound + "_c" + str(i)) + for compound in compound_list: + if compound in compound_reactions: + cfound = 0 + compound_coef = dict() + for reaction in compound_reactions[compound]: + if ( + reaction[0:3].lower() != "ex_" + and reaction[0:3].lower() != "dm_" + ): + cfound = 1 + rxnobj = self.cobramodel.reactions.get_by_id( + reaction + ) + compound_coef[rxnobj.forward_variable] = 1000 + compound_coef[rxnobj.reverse_variable] = 1000 + if cfound == 1: + if compound not in self.compound_flux_variables: + self.compound_flux_variables[ + compound + ] = self.cobramodel.problem.Variable( + compound + "_f", lb=0, ub=1 + ) + self.cobramodel.add_cons_vars( + self.compound_flux_variables[compound] + ) + self.compound_flux_constraints[ + compound + ] = self.cobramodel.problem.Constraint( + Zero, lb=0, ub=None, name=compound + "_flux" + ) + self.cobramodel.add_cons_vars( + self.compound_flux_constraints[compound] + ) + compound_coef[ + self.compound_flux_variables[compound] + ] = -1 + self.cobramodel.solver.update() + self.compound_flux_constraints[ + compound + ].set_linear_coefficients(compound_coef) + peak_coef[ + self.compound_flux_variables[compound] + ] = 1 + pfound = 1 + drain_reaction = ( + self.helper.add_drain_from_metabolite_id( + self.cobramodel, compound + ) + ) + if ( + drain_reaction.id + not in self.cobramodel.reactions + ): + self.cobramodel.add_reactions([drain_reaction]) + if pfound == 1: + if peakid not in self.metabolomics_peak_variables: + self.metabolomics_peak_variables[ + peakid + ] = self.cobramodel.problem.Variable(peakid, lb=0, ub=1) + self.cobramodel.add_cons_vars( + self.metabolomics_peak_variables[peakid] + ) + self.metabolomics_peak_constraints[ + peakid + ] = self.cobramodel.problem.Constraint( + Zero, lb=0, ub=None, name=peakid + ) + self.cobramodel.add_cons_vars( + self.metabolomics_peak_constraints[peakid] + ) + peak_coef[self.metabolomics_peak_variables[peakid]] = -1 + self.cobramodel.solver.update() + self.metabolomics_peak_constraints[ + peakid + ].set_linear_coefficients(peak_coef) + + return drain_fluxes + + def convert_template_compound(self, template_compound, index, template): + base_id = template_compound.id.split("_")[0] + base_compound = template.compounds.get_by_id(base_id) + new_id = template_compound.id + new_id += str(index) + compartment = template_compound.templatecompartment_ref.split("/").pop() + compartment += str(index) + + met = Metabolite( + new_id, + formula=base_compound.formula, + name=base_compound.name, + charge=template_compound.charge, + compartment=compartment, + ) + + met.annotation[ + "sbo" + ] = "SBO:0000247" # simple chemical - Simple, non-repetitive chemical entity. 
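`add_intracellular_metabolomics_constraints` above expects `peakstring` to be a semicolon-delimited list of peaks, each of the form `peak_id:coefficient:cpd1:cpd2:...`; compounds lacking a compartment suffix such as `_c0` are expanded over all indexed cytosolic compartments. A quick parsing sketch (peak IDs and compounds are invented):

```python
import re

peakstring = "peak1:1:cpd00027_c0;peak2:0.5:cpd00100"  # invented example
compartment_tag = re.compile(r"_[a-z]\d+$")
for peak in peakstring.split(";"):
    peak_id, coef, *compounds = peak.split(":")
    for cpd in compounds:
        tagged = bool(compartment_tag.search(cpd))
        print(peak_id, float(coef), cpd,
              "explicit compartment" if tagged else "expanded over _c0.._c999")
```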
+ met.annotation["seed.compound"] = base_id + return met + + def convert_template_reaction( + self, template_reaction, index, template, for_gapfilling=1 + ): + array = template_reaction.id.split("_") + base_id = array[0] + new_id = template_reaction.id + new_id += str(index) + + lower_bound = template_reaction.maxrevflux + upper_bound = template_reaction.maxforflux + + direction = template_reaction.GapfillDirection + if for_gapfilling == 0: + direction = template_reaction.direction + + if direction == ">": + lower_bound = 0 + elif direction == "<": + upper_bound = 0 + + cobra_reaction = Reaction( + new_id, + name=template_reaction.name, + lower_bound=lower_bound, + upper_bound=upper_bound, + ) + + object_stoichiometry = {} + for item in template_reaction.templateReactionReagents: + metabolite_id = item["templatecompcompound_ref"].split("/").pop() + template_compound = template.compcompounds.get_by_id(metabolite_id) + compartment = template_compound.templatecompartment_ref.split("/").pop() + if compartment == "e": + metabolite_id = metabolite_id + "0" + else: + metabolite_id = metabolite_id + str(index) + + metabolite = self.cobramodel.metabolites.get_by_id(metabolite_id) + object_stoichiometry[metabolite] = item["coefficient"] + + cobra_reaction.add_metabolites(object_stoichiometry) + + cobra_reaction.annotation["sbo"] = "SBO:0000176" # biochemical reaction + cobra_reaction.annotation["seed.reaction"] = base_id + + return cobra_reaction + + def build_model_extended_for_gapfilling( + self, + extend_with_template=1, + source_models=[], + input_templates=[], + model_penalty=1, + reaction_scores={}, + ): + model_id = self.fbamodel["id"] + ".gf" + + # Determine all indecies that should be gapfilled + indexlist = [0] * 1000 + compounds = self.fbamodel["modelcompounds"] + for compound in compounds: + compartment = compound["modelcompartment_ref"].split("/").pop() + basecomp = compartment[0:1] + if not basecomp == "e": + index = compartment[1:] + index = int(index) + indexlist[index] += 1 + + # Iterating over all indecies with more than 10 intracellular compounds: + gapfilling_penalties = dict() + for i in range(0, 1000): + if indexlist[i] > 10: + if extend_with_template == 1: + new_penalties = self.temp_extend_model_index_for_gapfilling( + i, input_templates + ) + gapfilling_penalties.update(new_penalties) + if i < len(source_models) and source_models[i] != None: + new_penalties = self.mdl_extend_model_index_for_gapfilling( + i, source_models[i], model_penalty + ) + gapfilling_penalties.update(new_penalties) + # Rescaling penalties by reaction scores and saving genes + for reaction in gapfilling_penalties: + array = reaction.split("_") + rxnid = array[0] + if rxnid in reaction_scores: + highest_score = 0 + for gene in reaction_scores[rxnid]: + if highest_score < reaction_scores[rxnid][gene]: + highest_score = reaction_scores[rxnid][gene] + factor = 1 - 0.9 * highest_score + if "reverse" in gapfilling_penalties[reaction]: + penalties[reaction.id]["reverse"] = ( + factor * penalties[reaction.id]["reverse"] + ) + if "forward" in gapfilling_penalties[reaction]: + penalties[reaction.id]["forward"] = ( + factor * penalties[reaction.id]["forward"] + ) + self.cobramodel.solver.update() + return gapfilling_penalties + + # Possible new function to add to the KBaseFBAModelToCobraBuilder to extend a model with a template for gapfilling for a specific index + def mdl_extend_model_index_for_gapfilling(self, index, source_model, model_penalty): + new_metabolites = {} + new_reactions = {} + new_exchange = [] + 
new_demand = []
+        new_penalties = dict()
+        local_remap = {}
+
+        comp = re.compile(r"(.*_*)(.)\d+$")
+        for modelcompound in source_model.metabolites:
+            cobra_metabolite = self.convert_modelcompound(modelcompound)
+            original_id = cobra_metabolite.id
+            groups = comp.match(cobra_metabolite.compartment)
+            if groups[2] == "e":
+                cobra_metabolite.compartment = groups[1] + groups[2] + "0"
+                groups = comp.match(cobra_metabolite.id)
+                cobra_metabolite.id = groups[1] + groups[2] + "0"
+            else:
+                cobra_metabolite.compartment = groups[1] + groups[2] + str(index)
+                groups = comp.match(cobra_metabolite.id)
+                cobra_metabolite.id = groups[1] + groups[2] + str(index)
+            if (
+                cobra_metabolite.id not in self.cobramodel.metabolites
+                and cobra_metabolite.id not in new_metabolites
+            ):
+                new_metabolites[cobra_metabolite.id] = cobra_metabolite
+                if original_id in self.auto_sink:
+                    self.demand_compounds.add(cobra_metabolite.id)
+                    new_demand.append(cobra_metabolite)
+                if cobra_metabolite.compartment == self.auto_exchange:
+                    self.exchange_compounds.add(cobra_metabolite.id)
+                    new_exchange.append(cobra_metabolite)
+            if cobra_metabolite.id in self.cobramodel.metabolites:
+                cobra_metabolite = self.cobramodel.metabolites.get_by_id(
+                    cobra_metabolite.id
+                )
+            else:  # Just in case the same compound is added twice - we want to switch the metabolite to the first new version
+                cobra_metabolite = new_metabolites[cobra_metabolite.id]
+            local_remap[original_id] = cobra_metabolite
+        # Adding all metabolites to model prior to adding reactions
+        self.cobramodel.add_metabolites(new_metabolites.values())
+
+        for modelreaction in source_model.reactions:
+            if modelreaction.id.split("_")[0] in self.blacklist:
+                continue  # skip blacklisted reactions
+            # cobra_reaction = self.convert_modelreaction(modelreaction)
+            cobra_reaction = modelreaction.copy()
+            groups = comp.match(cobra_reaction.id)
+            cobra_reaction.id = groups[1] + groups[2] + str(index)
+            new_penalties[cobra_reaction.id] = dict()
+            # Updating metabolites in reaction to new model
+            metabolites = cobra_reaction.metabolites
+            new_stoichiometry = {}
+            for metabolite in metabolites:
+                # Adding new coefficient:
+                new_stoichiometry[local_remap[metabolite.id]] = metabolites[metabolite]
+                # Zeroing out current coefficients
+                if local_remap[metabolite.id] != metabolite:
+                    new_stoichiometry[metabolite] = 0
+            cobra_reaction.add_metabolites(new_stoichiometry, combine=False)
+            if (
+                cobra_reaction.id not in self.cobramodel.reactions
+                and cobra_reaction.id not in new_reactions
+            ):
+                new_reactions[cobra_reaction.id] = cobra_reaction
+                new_penalties[cobra_reaction.id]["added"] = 1
+                if cobra_reaction.lower_bound < 0:
+                    new_penalties[cobra_reaction.id]["reverse"] = model_penalty
+                if cobra_reaction.upper_bound > 0:
+                    new_penalties[cobra_reaction.id]["forward"] = model_penalty
+            elif (
+                cobra_reaction.lower_bound < 0
+                and self.cobramodel.reactions.get_by_id(cobra_reaction.id).lower_bound
+                == 0
+            ):
+                self.cobramodel.reactions.get_by_id(
+                    cobra_reaction.id
+                ).lower_bound = cobra_reaction.lower_bound
+                self.cobramodel.reactions.get_by_id(
+                    cobra_reaction.id
+                ).update_variable_bounds()
+                new_penalties[cobra_reaction.id]["reverse"] = model_penalty
+                new_penalties[cobra_reaction.id]["reversed"] = 1
+            elif (
+                cobra_reaction.upper_bound > 0
+                and self.cobramodel.reactions.get_by_id(cobra_reaction.id).upper_bound
+                == 0
+            ):
+                self.cobramodel.reactions.get_by_id(
+                    cobra_reaction.id
+                ).upper_bound = cobra_reaction.upper_bound
+                self.cobramodel.reactions.get_by_id(
+                    cobra_reaction.id
+                ).update_variable_bounds()
+
new_penalties[cobra_reaction.id]["forward"] = model_penalty + new_penalties[cobra_reaction.id]["reversed"] = 1 + + # Only run this on new exchanges so we don't readd for all exchanges + for cpd in new_exchange: + drain_reaction = self.helper.add_drain_from_metabolite_id(cpd.id) + if ( + drain_reaction.id not in self.cobramodel.reactions + and drain_reaction.id not in new_reactions + ): + new_reactions[drain_reaction.id] = drain_reaction + + # Only run this on new demands so we don't readd for all exchanges + for cpd_id in new_demand: + drain_reaction = self.helper.add_drain_from_metabolite_id( + cpd_id, + lower_bound=self.COBRA_0_BOUND, + upper_bound=self.COBRA_DEFAULT_UB, + prefix="DM_", + prefix_name="Demand for ", + sbo="SBO:0000627", + ) + if ( + drain_reaction.id not in self.cobramodel.reactions + and drain_reaction.id not in new_reactions + ): + new_reactions[drain_reaction.id] = drain_reaction + + # Adding all new reactions to the model at once (much faster than one at a time) + self.cobramodel.add_reactions(new_reactions.values()) + return new_penalties + + # Possible new function to add to the KBaseFBAModelToCobraBuilder to extend a model with a template for gapfilling for a specific index + def temp_extend_model_index_for_gapfilling(self, index, input_templates=[]): + new_metabolites = {} + new_reactions = {} + new_exchange = [] + new_demand = [] + new_penalties = dict() + template = None + if index < len(input_templates): + template = input_templates[index] + elif index in self.fbamodel["template_refs"]: + template = self.kbapi.get_from_ws(self.fbamodel["template_refs"][index]) + else: + template = self.kbapi.get_from_ws(self.fbamodel["template_ref"]) + + if template.info.type != "KBaseFBA.NewModelTemplate": + raise ObjectError( + template.info.type + " loaded when KBaseFBA.NewModelTemplate expected" + ) + + for template_compound in template.compcompounds: + tempindex = index + compartment = template_compound.templatecompartment_ref.split("/").pop() + if compartment == "e": + tempindex = 0 + + cobra_metabolite = self.convert_template_compound( + template_compound, tempindex, template + ) + if ( + cobra_metabolite.id not in self.cobramodel.metabolites + and cobra_metabolite.id not in new_metabolites + ): + new_metabolites[cobra_metabolite.id] = cobra_metabolite + self.cobramodel.add_metabolites([cobra_metabolite]) + if cobra_metabolite.id in self.auto_sink: + self.demand_compounds.add(cobra_metabolite.id) + new_demand.append(cobra_metabolite.id) + if cobra_metabolite.compartment == self.auto_exchange: + new_exchange.append(cobra_metabolite.id) + self.exchange_compounds.add(cobra_metabolite.id) + # Adding all metabolites to model prior to adding reactions + self.cobramodel.add_metabolites(new_metabolites.values()) + + for template_reaction in template.reactions: + if template_reaction.id.split("_")[0] in self.blacklist: + continue + cobra_reaction = self.convert_template_reaction( + template_reaction, index, template, 1 + ) + new_penalties[cobra_reaction.id] = dict() + if ( + cobra_reaction.id not in self.cobramodel.reactions + and cobra_reaction.id not in new_reactions + ): + # Adding any template reactions missing from the present model + new_reactions[cobra_reaction.id] = cobra_reaction + if cobra_reaction.lower_bound < 0: + new_penalties[cobra_reaction.id]["reverse"] = ( + template_reaction.base_cost + template_reaction.reverse_penalty + ) + if cobra_reaction.upper_bound > 0: + new_penalties[cobra_reaction.id]["forward"] = ( + template_reaction.base_cost + 
template_reaction.forward_penalty + ) + new_penalties[cobra_reaction.id]["added"] = 1 + elif template_reaction.GapfillDirection == "=": + # Adjusting directionality as needed for existing reactions + new_penalties[cobra_reaction.id]["reversed"] = 1 + if ( + self.cobramodel.reactions.get_by_id(cobra_reaction.id).lower_bound + == 0 + ): + self.cobramodel.reactions.get_by_id( + cobra_reaction.id + ).lower_bound = template_reaction.maxrevflux + self.cobramodel.reactions.get_by_id( + cobra_reaction.id + ).update_variable_bounds() + new_penalties[cobra_reaction.id]["reverse"] = ( + template_reaction.base_cost + template_reaction.reverse_penalty + ) + if ( + self.cobramodel.reactions.get_by_id(cobra_reaction.id).upper_bound + == 0 + ): + self.cobramodel.reactions.get_by_id( + cobra_reaction.id + ).upper_bound = template_reaction.maxforflux + self.cobramodel.reactions.get_by_id( + cobra_reaction.id + ).update_variable_bounds() + new_penalties[cobra_reaction.id]["forward"] = ( + template_reaction.base_cost + template_reaction.forward_penalty + ) + + # Only run this on new exchanges so we don't readd for all exchanges + for cpd_id in new_exchange: + drain_reaction = self.helper.add_drain_from_metabolite_id(cpd_id) + if drain_reaction != None and drain_reaction.id not in new_reactions: + new_reactions[drain_reaction.id] = drain_reaction + + # Only run this on new demands so we don't readd for all exchanges + for cpd_id in new_demand: + drain_reaction = self.helper.add_drain_from_metabolite_id( + cpd_id, self.COBRA_0_BOUND, self.COBRA_DEFAULT_UB, "DM_", "Demand for " + ) + if drain_reaction != None and drain_reaction.id not in new_reactions: + new_reactions[drain_reaction.id] = drain_reaction + + # Adding all new reactions to the model at once (much faster than one at a time) + self.cobramodel.add_reactions(new_reactions.values()) + return new_penalties + + def convert_modelreaction(self, reaction, bigg=False): + mr_id = reaction.id + name = reaction.name + annotation = reaction.annotation + lower_bound, upper_bound = reaction.get_reaction_constraints() + + id = build_rxn_id(mr_id) + if bigg and "bigg.reaction" in annotation: + id = annotation["bigg.reaction"] + + gpr = reaction.get_gpr() + + cobra_reaction = Reaction( + id, name=name, lower_bound=lower_bound, upper_bound=upper_bound + ) + cobra_reaction.annotation[ + self.SBO_ANNOTATION + ] = "SBO:0000176" # biochemical reaction + cobra_reaction.annotation.update(annotation) + + if id.startswith("rxn"): + cobra_reaction.annotation["seed.reaction"] = id.split("_")[0] + + cobra_reaction.add_metabolites( + self.convert_modelreaction_stoichiometry(reaction) + ) + + cobra_reaction.gene_reaction_rule = reaction.gene_reaction_rule + + for genes in gpr: + for gene in genes: + if not gene in self.genes: + self.genes[gene] = gene + + return cobra_reaction + + def convert_modelcompound(self, metabolite, bigg=False): + formula = metabolite.formula + name = metabolite.name + charge = metabolite.charge + mc_id = metabolite.id + compartment = metabolite.compartment + annotation = metabolite.annotation + + id = build_cpd_id(mc_id) + + if bigg and "bigg.metabolite" in annotation: + id = annotation["bigg.metabolite"] + "_" + compartment + # print(id) + + met = Metabolite( + id, formula=formula, name=name, charge=charge, compartment=compartment + ) + + met.annotation[ + self.SBO_ANNOTATION + ] = "SBO:0000247" # simple chemical - Simple, non-repetitive chemical entity. 
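`convert_modelcompound` and `convert_modelreaction` lean on the `build_cpd_id`/`build_rxn_id` helpers defined at the top of this file. As written, `build_cpd_id` computes `str_fix` but returns the unmodified input (unlike `build_rxn_id`), so the `__DASH__` escaping is silently dropped; a corrected sketch:

```python
import logging

logger = logging.getLogger(__name__)

def build_cpd_id(raw_id):
    # Strip the "M_"/"M-" prefix and escape dashes, mirroring build_rxn_id.
    if raw_id.startswith(("M_", "M-")):
        raw_id = raw_id[2:]
    fixed = raw_id.replace("-", "__DASH__")
    if fixed != raw_id:
        logger.debug("[Species] rename: [%s] -> [%s]", raw_id, fixed)
    return fixed  # return the escaped ID, not the original
```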
+ if id.startswith("cpd"): + met.annotation["seed.compound"] = id.split("_")[0] + met.annotation.update(annotation) + return met + + def convert_modelreaction_stoichiometry(self, reaction): + object_stoichiometry = {} + s = reaction.stoichiometry + for metabolite_id in s: + if metabolite_id in self.metabolites_remap: + object_stoichiometry[ + self.cobramodel.metabolites.get_by_id( + self.metabolites_remap[metabolite_id] + ) + ] = s[metabolite_id] + return object_stoichiometry + + def create_binary_variables(self, rxnobj, forward=1, reverse=1): + if rxnobj.id not in self.binary_flux_variables: + self.binary_flux_variables[rxnobj.id] = dict() + self.binary_flux_constraints[rxnobj.id] = dict() + if ( + forward == 1 + and rxnobj.upper_bound > 0 + and "forward" not in self.binary_flux_variables[rxnobj.id] + ): + self.binary_flux_variables[rxnobj.id][ + "forward" + ] = self.cobramodel.problem.Variable( + rxnobj.id + "_fb", lb=0, ub=1, type="binary" + ) + self.cobramodel.add_cons_vars( + self.binary_flux_variables[rxnobj.id]["forward"] + ) + self.binary_flux_constraints[rxnobj.id][ + "forward" + ] = self.cobramodel.problem.Constraint( + 1000 * self.binary_flux_variables[rxnobj.id]["forward"] + - rxnobj.forward_variable, + lb=0, + ub=None, + name=rxnobj.id + "_fb", + ) + self.cobramodel.add_cons_vars( + self.binary_flux_constraints[rxnobj.id]["forward"] + ) + if ( + reverse == 1 + and rxnobj.lower_bound < 0 + and "reverse" not in self.binary_flux_variables[rxnobj.id] + ): + self.binary_flux_variables[rxnobj.id][ + "reverse" + ] = self.cobramodel.problem.Variable( + rxnobj.id + "_bb", lb=0, ub=1, type="binary" + ) + self.cobramodel.add_cons_vars( + self.binary_flux_variables[rxnobj.id]["reverse"] + ) + self.binary_flux_constraints[rxnobj.id][ + "reverse" + ] = self.cobramodel.problem.Constraint( + 1000 * self.binary_flux_variables[rxnobj.id]["reverse"] + - rxnobj.forward_variable, + lb=0, + ub=None, + name=rxnobj.id + "_bb", + ) + self.cobramodel.add_cons_vars( + self.binary_flux_constraints[rxnobj.id]["reverse"] + ) + + def binary_check_gapfilling_solution( + self, gapfilling_penalties, add_solution_exclusion_constraint + ): + objcoef = {} + flux_values = self.compute_flux_values_from_variables() + for rxnobj in self.cobramodel.reactions: + if rxnobj.id in gapfilling_penalties: + if ( + "reverse" in gapfilling_penalties[rxnobj.id] + and flux_values[rxnobj.id]["reverse"] > Zero + ): + self.create_binary_variables(rxnobj, 0, 1) + objcoef[self.binary_flux_variables[rxnobj.id]["reverse"]] = 1 + if ( + "forward" in gapfilling_penalties[rxnobj.id] + and flux_values[rxnobj.id]["forward"] > Zero + ): + self.create_binary_variables(rxnobj, 1, 0) + objcoef[self.binary_flux_variables[rxnobj.id]["forward"]] = 1 + with self.cobramodel: + # Setting all gapfilled reactions not in the solution to zero + min_reaction_objective = self.cobramodel.problem.Objective( + Zero, direction="min" + ) + for rxnobj in self.cobramodel.reactions: + if rxnobj.id in gapfilling_penalties: + if ( + "reverse" in gapfilling_penalties[rxnobj.id] + and flux_values[rxnobj.id]["reverse"] <= Zero + ): + rxnobj.lower_bound = 0 + if ( + "forward" in gapfilling_penalties[rxnobj.id] + and flux_values[rxnobj.id]["forward"] <= Zero + ): + rxnobj.upper_bound = 0 + rxnobj.update_variable_bounds() + # Setting the objective to be minimization of sum of binary variables + self.cobramodel.objective = min_reaction_objective + min_reaction_objective.set_linear_coefficients(objcoef) + with open("GapfillBinary.lp", "w") as out: + 
out.write(str(self.cobramodel.solver))
+            self.cobramodel.optimize()
+            flux_values = self.compute_flux_values_from_variables()
+        if add_solution_exclusion_constraint == 1:
+            self.add_binary_solution_exclusion_constraint(flux_values)
+        return flux_values
+
+    # Adds a constraint that eliminates a gapfilled solution from feasibility so a new solution can be obtained
+    def add_binary_solution_exclusion_constraint(self, flux_values):
+        count = len(self.solution_exclusion_constraints)
+        solution_coef = {}
+        solution_size = 0
+        for reaction in self.binary_flux_variables:
+            for direction in self.binary_flux_variables[reaction]:
+                if flux_values[reaction][direction] > Zero:
+                    solution_size += 1
+                    solution_coef[self.binary_flux_variables[reaction][direction]] = 1
+        if len(solution_coef) > 0:
+            new_exclusion_constraint = self.cobramodel.problem.Constraint(
+                Zero,
+                lb=None,
+                ub=(solution_size - 1),
+                name="exclusion." + str(count + 1),
+            )
+            self.cobramodel.add_cons_vars(new_exclusion_constraint)
+            self.cobramodel.solver.update()
+            new_exclusion_constraint.set_linear_coefficients(solution_coef)
+            self.solution_exclusion_constraints.append(new_exclusion_constraint)
+            return new_exclusion_constraint
+        return None
+
+    # Takes gapfilling penalties and creates an objective function minimizing gapfilled reactions
+    def create_minimal_reaction_objective(self, penalty_hash, default_penalty=0):
+        reaction_objective = self.cobramodel.problem.Objective(Zero, direction="min")
+        obj_coef = dict()
+        for reaction in self.cobramodel.reactions:
+            if reaction.id in penalty_hash:
+                # Minimizing gapfilled reactions
+                if "reverse" in penalty_hash[reaction.id]:
+                    obj_coef[reaction.reverse_variable] = abs(
+                        penalty_hash[reaction.id]["reverse"]
+                    )
+                elif default_penalty != 0:
+                    obj_coef[reaction.reverse_variable] = default_penalty
+                if "forward" in penalty_hash[reaction.id]:
+                    obj_coef[reaction.forward_variable] = abs(
+                        penalty_hash[reaction.id]["forward"]
+                    )
+                elif default_penalty != 0:
+                    obj_coef[reaction.forward_variable] = default_penalty
+            else:
+                obj_coef[reaction.forward_variable] = default_penalty
+                obj_coef[reaction.reverse_variable] = default_penalty
+
+        self.cobramodel.objective = reaction_objective
+        reaction_objective.set_linear_coefficients(obj_coef)
+
+    # Needed to add gapfilled compounds to a KBase model when saving a gapfilled model
+    def convert_cobra_compound_to_kbcompound(self, cpd, kbmodel, add_to_model=1):
+        refid = "cpd00000"
+        if re.search(r"cpd\d+_[a-z]+", cpd.id):
+            refid = cpd.id
+            refid = re.sub(r"_[a-z]\d+$", "", refid)
+        cpd_data = {
+            "aliases": [],
+            "charge": cpd.charge,
+            "compound_ref": "~/template/compounds/id/" + refid,
+            "dblinks": {},
+            "formula": cpd.formula,
+            "id": cpd.id,
+            "inchikey": "ALYNCZNDIQEVRV-UHFFFAOYSA-M",
+            "modelcompartment_ref": "~/modelcompartments/id/" + cpd.id.split("_").pop(),
+            "name": cpd.name,
+            "numerical_attributes": {},
+            "string_attributes": {},
+        }
+        cpd_data = AttrDict(cpd_data)
+        if add_to_model == 1:
+            kbmodel.modelcompounds.append(cpd_data)
+        return cpd_data
+
+    # Needed to add gapfilled reactions to a KBase model when saving a gapfilled model
+    def convert_cobra_reaction_to_kbreaction(
+        self, rxn, kbmodel, direction="=", add_to_model=1
+    ):
+        rxnref = "~/template/reactions/id/rxn00000_c"
+        if re.search(r"rxn\d+_[a-z]+", rxn.id):
+            rxnref = "~/template/reactions/id/" + rxn.id
+            rxnref = re.sub(r"\d+$", "", rxnref)
+        rxn_data = {
+            "id": rxn.id,
+            "aliases": [],
+            "dblinks": {},
+            "direction": direction,
+            "edits": {},
+
"gapfill_data": {}, + "maxforflux": 1000000, + "maxrevflux": 1000000, + "modelReactionProteins": [], + "modelReactionReagents": [], + "modelcompartment_ref": "~/modelcompartments/id/" + rxn.id.split("_").pop(), + "name": rxn.name, + "numerical_attributes": {}, + "probability": 0, + "protons": 0, + "reaction_ref": rxnref, + "string_attributes": {}, + } + rxn_data = AttrDict(rxn_data) + for cpd in rxn.metabolites: + if cpd.id not in kbmodel.modelcompounds: + convert_cobra_compound_to_kbcompound(cpd, kbmodel, 1) + rxn_data.modelReactionReagents.append( + { + "coefficient": rxn.metabolites[cpd], + "modelcompound_ref": "~/modelcompounds/id/" + cpd.id, + } + ) + if add_to_model == 1: + kbmodel.modelreactions.append(rxn_data) + return rxn_data + + def convert_objective_to_constraint(self, lower_bound, upper_bound): + old_obj_variable = self.cobramodel.problem.Variable( + name="old_objective_variable", lb=lower_bound, ub=upper_bound + ) + old_obj_constraint = self.cobramodel.problem.Constraint( + self.cobramodel.solver.objective.expression - old_obj_variable, + lb=0, + ub=0, + name="old_objective_constraint", + ) + self.cobramodel.add_cons_vars([old_obj_variable, old_obj_constraint]) + + def compute_flux_values_from_variables(self): + flux_values = {} + for rxnobj in self.cobramodel.reactions: + flux_values[rxnobj.id] = {} + flux_values[rxnobj.id]["reverse"] = rxnobj.reverse_variable.primal + flux_values[rxnobj.id]["forward"] = rxnobj.forward_variable.primal + return flux_values + + def compute_gapfilled_solution(self, penalties, flux_values=None): + if flux_values == None: + flux_values = self.compute_flux_values_from_variables() + output = {"reversed": {}, "new": {}} + for reaction in self.cobramodel.reactions: + if reaction.id in penalties: + if ( + flux_values[reaction.id]["forward"] > Zero + and "forward" in penalties[reaction.id] + ): + if "added" in penalties[reaction.id]: + output["new"][reaction.id] = ">" + else: + output["reversed"][reaction.id] = ">" + elif ( + flux_values[reaction.id]["reverse"] > Zero + and "reverse" in penalties[reaction.id] + ): + if "added" in penalties[reaction.id]: + output["new"][reaction.id] = "<" + else: + output["reversed"][reaction.id] = "<" + return output + + def add_gapfilling_solution_to_kbase_model(self, newmodel, penalties, media_ref): + gfid = None + if gfid == None: + largest_index = 0 + for gapfilling in newmodel.gapfillings: + current_index = gapfilling.id.split(".").pop() + if largest_index == 0 or largest_index < current_index: + largest_index = current_index + gfid = "gf." + str(largest_index + 1) + newmodel.gapfillings.append( + { + "gapfill_id": newmodel.id + "." 
+ gfid, + "id": gfid, + "integrated": 1, + "integrated_solution": "0", + "media_ref": media_ref, + } + ) + for reaction in self.cobramodel.reactions: + if reaction.id in penalties: + if ( + reaction.forward_variable.primal > Zero + and "forward" in penalties[reaction.id] + ): + if reaction.id not in newmodel.modelreactions: + self.convert_cobra_reaction_to_kbreaction( + reaction, newmodel, ">", 1 + ) + gfrxn = newmodel.modelreactions.get_by_id(reaction.id) + gfrxn.gapfill_data[gfid] = dict() + gfrxn.gapfill_data[gfid]["0"] = [">", 1, []] + elif ( + reaction.forward_variable.primal > Zero + and "reverse" in penalties[reaction.id] + ): + if reaction.id not in newmodel.modelreactions: + self.convert_cobra_reaction_to_kbreaction( + reaction, newmodel, "<", 1 + ) + gfrxn = newmodel.modelreactions.get_by_id(reaction.id) + gfrxn.gapfill_data[gfid] = dict() + gfrxn.gapfill_data[gfid]["0"] = ["<", 1, []] + + def compute_reaction_scores(self, weigh_all_events_equally=1, weights=None): + reaction_genes = {} + if "genome_ref" in self.fbamodel: + anno_api = annotation_ontology_api() + events = anno_api.get_annotation_ontology_events( + { + "input_ref": self.fbamodel["genome_ref"], + } + ) + for event in events: + for gene in event["ontology_terms"]: + if "modelseed_ids" in event["ontology_terms"][gene]: + for rxn in event["ontology_terms"][gene]["modelseed_ids"]: + newrxn = re.sub("^MSRXN:", "", rxn) + if newrxn not in reaction_genes: + reaction_genes[newrxn] = {} + if gene not in reaction_genes[newrxn]: + reaction_genes[newrxn][gene] = 0 + if weigh_all_events_equally == 1 or weights == None: + reaction_genes[newrxn][gene] += 1 + elif event["description"] in weights: + reaction_genes[newrxn][gene] += weights[ + event["description"] + ] + elif event["event_id"] in weights: + reaction_genes[newrxn][gene] += weights[ + event["event_id"] + ] + elif event["id"] in weights: + reaction_genes[newrxn][gene] += weights[event["id"]] + return reaction_genes + + def replicate_model(self, count): + newmodel = Model(self.cobramodel.id + "_rep" + str(count)) + utilities = KBaseFBAUtilities( + newmodel, + newmodel, + self.kbapi, + self.media, + default_uptake=self.default_uptake, + default_excretion=self.default_excretion, + blacklist=self.blacklist, + ) + metabolites = [] + reactions = [] + metabolite_hash = {} + for i in range(0, count): + for metabolite in self.cobramodel.metabolites: + metabolite = metabolite.copy() + metabolite.id = metabolite.id + "__" + str(i) + metabolite_hash[metabolite.id] = metabolite + metabolites.append(metabolite) + for reaction in self.cobramodel.reactions: + reaction = reaction.copy() + reaction.id = reaction.id + "__" + str(i) + input_metabolites = {} + for metabolite in reaction.metabolites: + newid = metabolite.id + "__" + str(i) + input_metabolites[metabolite_hash[newid]] = reaction.metabolites[ + metabolite + ] + reaction.add_metabolites(input_metabolites, combine=False) + reactions.append(reaction) + newmodel.add_metabolites(metabolites) + newmodel.add_reactions(reactions) + return utilities diff --git a/modelseedpy/core/fbahelper.py b/modelseedpy/core/fbahelper.py index be17ec8a..502611d9 100644 --- a/modelseedpy/core/fbahelper.py +++ b/modelseedpy/core/fbahelper.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import - import logging from chemicals import periodic_table import re @@ -12,7 +11,6 @@ ) # !!! Gene, Metabolite, and Model are never used from cobra.util import solver as sutil # !!! 
sutil is never used import time -from modelseedpy.biochem import from_local from scipy.odr.odrpack import Output # !!! Output is never used from chemw import ChemMW from warnings import warn @@ -117,16 +115,26 @@ def modelseed_id_from_cobra_reaction(reaction): @staticmethod def metabolite_mw(metabolite): + fixed_masses = {"cpd11416": 1, "cpd17041": 0, "cpd17042": 0, "cpd17043": 0} + msid = FBAHelper.modelseed_id_from_cobra_metabolite(metabolite) + if msid in fixed_masses: + return fixed_masses[msid] + if not metabolite.formula: + return 0 + formula = re.sub("R\d*", "", metabolite.formula) try: - chem_mw = ChemMW() - chem_mw.mass(metabolite.formula) + chem_mw = ChemMW(printing=False) + chem_mw.mass(formula) return chem_mw.raw_mw except: - warn( + logger.warn( "The compound " + metabolite.id - + " possesses an unconventional formula {metabolite.formula}; hence, the MW cannot be computed." + + " possesses an unconventional formula " + + metabolite.formula + + "; hence, the MW cannot be computed." ) + return 0 @staticmethod def elemental_mass(): @@ -134,6 +142,8 @@ def elemental_mass(): @staticmethod def get_modelseed_db_api(modelseed_path): + from modelseedpy.biochem import from_local + return from_local(modelseed_path) @staticmethod @@ -171,7 +181,7 @@ def msid_hash(model): output = {} for met in model.metabolites: msid = FBAHelper.modelseed_id_from_cobra_metabolite(met) - if msid != None: + if msid is not None: if msid not in output: output[msid] = [] output[msid].append(met) @@ -265,6 +275,11 @@ def parse_id(cobra_obj): return (m[1], m[2], int(m[3])) return None + @staticmethod + def id_from_ref(ref): + array = ref.split("/") + return array[-1] + @staticmethod def medianame(media): if media == None: @@ -281,6 +296,54 @@ def validate_dictionary(dictionary, required_keys, optional_keys={}): dictionary[key] = optional_keys[key] return dictionary + @staticmethod + def parse_media(media): + return [cpd.id for cpd in media.data["mediacompounds"]] + + def get_reframed_model( + kbase_model, + ): + from reframed import from_cobrapy + + reframed_model = from_cobrapy(kbase_model) + if hasattr(kbase_model, "id"): + reframed_model.id = kbase_model.id + reframed_model.compartments.e0.external = True + return reframed_model + + @staticmethod + def add_vars_cons(model, vars_cons): + model.add_cons_vars(vars_cons) + model.solver.update() + return model + + @staticmethod + def update_model_media(model, media): + medium = {} + model_reactions = [rxn.id for rxn in model.reactions] + for cpd in media.data["mediacompounds"]: + ex_rxn = f"EX_{cpd.id}" + if ex_rxn not in model_reactions: + model.add_boundary( + metabolite=Metabolite(id=cpd.id, name=cpd.name, compartment="e0"), + type="exchange", + lb=cpd.minFlux, + ub=cpd.maxFlux, + ) + medium[ex_rxn] = cpd.maxFlux + model.medium = medium + return model + + @staticmethod + def filter_cobra_set(cobra_set): + unique_ids = set(obj.id for obj in cobra_set) + unique_objs = set() + for obj in cobra_set: + if obj.id in unique_ids: + unique_objs.add(obj) + unique_ids.remove(obj.id) + return unique_objs + @staticmethod def get_reframed_model( kbase_model, diff --git a/modelseedpy/core/gapfillinghelper.py b/modelseedpy/core/gapfillinghelper.py index 6c5d6afc..ed21fb00 100644 --- a/modelseedpy/core/gapfillinghelper.py +++ b/modelseedpy/core/gapfillinghelper.py @@ -1196,7 +1196,6 @@ def replicate_model(self, count): def test_reaction_additions_againt_limits(self, reactions, directions, tests): filtered_rxn = [] filtered_direction = [] - # Using "with" to ensure we don't 
alter the model with these tests model = self.cobramodel with model: diff --git a/modelseedpy/core/msatpcorrection.py b/modelseedpy/core/msatpcorrection.py index 1f91ac51..fdcf9c67 100644 --- a/modelseedpy/core/msatpcorrection.py +++ b/modelseedpy/core/msatpcorrection.py @@ -1,23 +1,42 @@ # -*- coding: utf-8 -*- import logging -import itertools -import cobra import json -import time -from optlang.symbolics import Zero, add -from modelseedpy.core.rast_client import RastClient -from modelseedpy.core.msgenome import normalize_role -from modelseedpy.core.msmodel import ( - get_gpr_string, - get_reaction_constraints_from_direction, -) -from cobra.core import Gene, Metabolite, Model, Reaction +import pandas as pd from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.core.mstemplate import MSTemplateBuilder from modelseedpy.core import FBAHelper, MSGapfill, MSMedia from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.helpers import get_template logger = logging.getLogger(__name__) - +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + +min_gap = { + "Glc.O2": 5, + "Etho.O2": 0.01, + "Ac.O2": 1, + "Pyr.O2": 3, + "Glyc.O2": 2, + "Fum.O2": 3, + "Succ.O2": 2, + "Akg.O2": 2, + "LLac.O2": 2, + "Dlac.O2": 2, + "For.O2": 1.875, + "For.NO3": 1.5, + "Pyr.NO": 2.5, + "Pyr.NO2": 2.5, + "Pyr.NO3": 2.5, + "Pyr.SO4": 2.5, +} + +default_threshold_multipliers = { + "Pyr": 2, + "Glc": 2, + "default": 1.2, +} class MSATPCorrection: @@ -25,57 +44,138 @@ class MSATPCorrection: def __init__( self, - model, - core_template, - atp_medias: list, + model_or_mdlutl, + core_template=None, + atp_medias=[], compartment="c0", - max_gapfilling=None, + max_gapfilling=10, gapfilling_delta=0, atp_hydrolysis_id=None, + load_default_medias=True, + forced_media=[], + default_media_path=None, ): """ - :param model: :param core_template: - :param atp_medias: - :param atp_objective: - :param max_gapfilling: - :param gapfilling_delta: - :param atp_hydrolysis_id: ATP Hydrolysis reaction ID, if None it will perform a SEED reaction search + :param atp_medias: list : list of additional medias to test + :param load_default_medias: Bool : load default media set + :param forced_media: list : name of medias in which ATP production should be forced + :param compartment: string : ID of compartment to test ATP in + :param max_gapfilling: int : maximum gapfilling allowed in accepted media + :param gapfilling_delta: float : difference between lowest gapfilling and current gapfilling where media will be accepted + :param atp_hydrolysis_id: string : ATP Hydrolysis reaction ID, if None it will perform a SEED reaction search """ - if isinstance(model, MSModelUtil): - self.model = model.model - self.modelutl = model + # Discerning input is model or mdlutl and setting internal links + if isinstance(model_or_mdlutl, MSModelUtil): + self.model = model_or_mdlutl.model + self.modelutl = model_or_mdlutl else: - self.model = model - self.modelutl = MSModelUtil(model) + self.model = model_or_mdlutl + self.modelutl = MSModelUtil.get(model_or_mdlutl) + # Setting atpcorrection attribute in model utl so link is bidirectional + self.modelutl.atputl = self + self.compartment = compartment + if atp_hydrolysis_id and atp_hydrolysis_id in self.model.reactions: self.atp_hydrolysis = self.model.reactions.get_by_id(atp_hydrolysis_id) else: output = self.modelutl.add_atp_hydrolysis(compartment) self.atp_hydrolysis = output["reaction"] + + self.media_hash
= {} self.atp_medias = [] - for media in atp_medias: - if isinstance(media, MSMedia): - self.atp_medias.append([media, 0.01]) - else: - self.atp_medias.append(media) + + if load_default_medias: + self.load_default_medias(default_media_path) + + self.forced_media = [] + for media_id in forced_media: + for item in self.atp_medias: + if item[0].id == media_id: + print("Forced media: " + media_id) + self.forced_media.append(item[0]) + break + self.max_gapfilling = max_gapfilling self.gapfilling_delta = gapfilling_delta - self.coretemplate = core_template - self.msgapfill = MSGapfill( - self.modelutl, default_gapfill_templates=core_template - ) + + if not core_template: + self.load_default_template() + else: + self.coretemplate = core_template + + # These should stay as None until atp correction is actually run + self.msgapfill = None + self.cumulative_core_gapfilling = None + self.selected_media = None self.original_bounds = {} self.noncore_reactions = [] self.other_compartments = [] self.media_gapfill_stats = {} - self.selected_media = [] self.filtered_noncore = [] self.lp_filename = None self.multiplier = 1.2 + def get_msgapfill(self): + if self.msgapfill is None: + self.msgapfill = MSGapfill( + self.modelutl, + default_gapfill_templates=[self.coretemplate], + default_target=self.atp_hydrolysis.id, + ) + return self.msgapfill + + def load_default_template(self): + self.coretemplate = MSTemplateBuilder.from_dict( + get_template("template_core"), None + ).build() + + def load_default_medias(self, default_media_path=None): + if default_media_path is None: + import os.path as _path + + current_file_path = _path.dirname(_path.abspath(__file__)) + default_media_path = f"{current_file_path}/../data/atp_medias.tsv" + filename = default_media_path + medias = pd.read_csv(filename, sep="\t", index_col=0).to_dict() + for media_id in medias: + if media_id not in ["name","msid"]: + media_d = {} + for exchange, v in medias[media_id].items(): + if v > 0: + k = exchange.split("_")[1] + media_d[k] = v + media_d["cpd00001"] = 1000 + media_d["cpd00067"] = 1000 + media = MSMedia.from_dict(media_d) + media.id = media_id + media.name = media_id + self.atp_medias.append([media, min_gap.get(media_id, 0.01)]) + + media_ids = set() + temp_medias = self.atp_medias + self.atp_medias = [] + for media in temp_medias: + if isinstance(media, list): + if media[0].id in media_ids: + raise ValueError("media ids not unique") + media_ids.add(media[0].id) + self.atp_medias.append(media) + self.media_hash[media[0].id] = media[0] + else: + if media.id in media_ids: + raise ValueError("media ids not unique") + media_ids.add(media.id) + self.atp_medias.append([media, 0.01]) + self.media_hash[media.id] = media + if "empty" not in self.media_hash: + media = MSMedia.from_dict({}) + media.id = "empty" + media.name = "empty" + self.media_hash[media.id] = media + @staticmethod def find_reaction_in_template(model_reaction, template, compartment): template_reaction = None # we save lookup result here @@ -126,6 +226,7 @@ def disable_noncore_reactions(self): self.other_compartments = [] # Iterating through reactions and disabling for reaction in self.model.reactions: + gfrxn = self.get_msgapfill().gfmodel.reactions.get_by_id(reaction.id) if reaction.id == self.atp_hydrolysis.id: continue if FBAHelper.is_ex(reaction): @@ -150,10 +251,12 @@ def disable_noncore_reactions(self): logger.debug(reaction.id + " core but reversible") self.noncore_reactions.append([reaction, "<"]) reaction.lower_bound = 0 + gfrxn.lower_bound = 0 if reaction.upper_bound 
> 0 and template_reaction.upper_bound <= 0: logger.debug(reaction.id + " core but reversible") self.noncore_reactions.append([reaction, ">"]) reaction.upper_bound = 0 + gfrxn.upper_bound = 0 else: logger.debug(f"{reaction.id} non core") if FBAHelper.rxn_compartment(reaction) != self.compartment: @@ -168,8 +271,10 @@ def disable_noncore_reactions(self): self.noncore_reactions.append([reaction, ">"]) reaction.lower_bound = 0 reaction.upper_bound = 0 + gfrxn.lower_bound = 0 + gfrxn.upper_bound = 0 - def evaluate_growth_media(self): + def evaluate_growth_media(self,no_gapfilling=False): """ Determines how much gap filling each input test media requires to make ATP @@ -177,20 +282,18 @@ def evaluate_growth_media(self): """ self.disable_noncore_reactions() self.media_gapfill_stats = {} - self.msgapfill.default_gapfill_templates = [self.coretemplate] + self.get_msgapfill().default_gapfill_templates = [self.coretemplate] if self.lp_filename: - self.msgapfill.lp_filename = self.lp_filename + self.get_msgapfill().lp_filename = self.lp_filename output = {} with self.model: self.model.objective = self.atp_hydrolysis.id - # self.model.objective = self.model.problem.Objective(Zero,direction="max") - - logger.debug( - f"ATP bounds: ({self.atp_hydrolysis.lower_bound}, {self.atp_hydrolysis.upper_bound})" - ) - # self.model.objective.set_linear_coefficients({self.atp_hydrolysis.forward_variable:1}) pkgmgr = MSPackageManager.get_pkg_mgr(self.model) + # First prescreening model for ATP production without gapfilling + media_list = [] + min_objectives = {} for media, minimum_obj in self.atp_medias: + logger.debug("evaluate media %s", media) pkgmgr.getpkg("KBaseMediaPkg").build_package(media) logger.debug("model.medium %s", self.model.medium) @@ -202,99 +305,138 @@ def evaluate_growth_media(self): solution.status, ) self.media_gapfill_stats[media] = None + output[media.id] = solution.objective_value + if ( solution.objective_value < minimum_obj or solution.status != "optimal" ): - self.media_gapfill_stats[media] = self.msgapfill.run_gapfilling( - media, self.atp_hydrolysis.id, minimum_obj - ) - # IF gapfilling fails - need to activate and penalize the noncore and try again + media_list.append(media) + min_objectives[media] = minimum_obj elif solution.objective_value >= minimum_obj: self.media_gapfill_stats[media] = {"reversed": {}, "new": {}} - logger.debug( - "gapfilling stats: %s", - json.dumps(self.media_gapfill_stats[media], indent=2), + + # Now running gapfilling on all conditions where initially there was no growth + if not no_gapfilling: + all_solutions = self.get_msgapfill().run_multi_gapfill( + media_list, + target=self.atp_hydrolysis.id, + minimum_objectives=min_objectives, + prefilter=False, + check_for_growth=False, + gapfilling_mode="Independent", + run_sensitivity_analysis=False, + integrate_solutions=False, ) + print(str(all_solutions)) + # Adding the new solutions to the media gapfill stats + for media in all_solutions: + self.media_gapfill_stats[media] = all_solutions[media] if MSATPCorrection.DEBUG: + export_data = {} + for media in self.media_gapfill_stats: + export_data[media.id] = self.media_gapfill_stats[media] with open("debug.json", "w") as outfile: - json.dump(self.media_gapfill_stats[media], outfile) + json.dump(export_data, outfile) return output - def determine_growth_media(self): + def determine_growth_media(self, max_gapfilling=None): """ Decides which of the test media to use as growth conditions for this model :return: """ + atp_att = {"tests": {}, "selected_media": {}, 
"core_atp_gapfilling": {}} self.selected_media = [] best_score = None for media in self.media_gapfill_stats: - gfscore = 0 + atp_att["core_atp_gapfilling"][media.id] = { + "score": 0, + "new": {}, + "reversed": {}, + } if self.media_gapfill_stats[media]: - gfscore = len( + atp_att["core_atp_gapfilling"][media.id]["score"] = len( self.media_gapfill_stats[media]["new"].keys() ) + 0.5 * len(self.media_gapfill_stats[media]["reversed"].keys()) - if best_score is None or gfscore < best_score: - best_score = gfscore + atp_att["core_atp_gapfilling"][media.id][ + "new" + ] = self.media_gapfill_stats[media]["new"] + atp_att["core_atp_gapfilling"][media.id][ + "reversed" + ] = self.media_gapfill_stats[media]["reversed"] + else: + atp_att["core_atp_gapfilling"][media.id] = { + "score": 1000, + "failed": True, + } + if ( + best_score is None + or atp_att["core_atp_gapfilling"][media.id]["score"] < best_score + ): + best_score = atp_att["core_atp_gapfilling"][media.id]["score"] + if self.max_gapfilling is None: self.max_gapfilling = best_score - logger.debug(f"max_gapfilling: {self.max_gapfilling}, best_score: {best_score}") + logger.info(f"max_gapfilling: {self.max_gapfilling}, best_score: {best_score}") for media in self.media_gapfill_stats: - gfscore = 0 - if self.media_gapfill_stats[media]: - gfscore = len( - self.media_gapfill_stats[media]["new"].keys() - ) + 0.5 * len(self.media_gapfill_stats[media]["reversed"].keys()) - - logger.debug(f"media gapfilling score: {media.id}: {gfscore}") - if gfscore <= self.max_gapfilling and gfscore <= ( + if atp_att["core_atp_gapfilling"][media.id][ + "score" + ] <= self.max_gapfilling and atp_att["core_atp_gapfilling"][media.id][ + "score" + ] <= ( best_score + self.gapfilling_delta ): self.selected_media.append(media) - - def determine_growth_media2(self, max_gapfilling=None): - """ - Decides which of the test media to use as growth conditions for this model - :return: - """ - - def scoring_function(media): - return len(self.media_gapfill_stats[media]["new"].keys()) + 0.5 * len( - self.media_gapfill_stats[media]["reversed"].keys() - ) - - self.selected_media = [] - media_scores = dict( - (media, scoring_function(media)) - for media in self.media_gapfill_stats - if self.media_gapfill_stats[media] - ) - best_score = min(media_scores.values()) - if max_gapfilling is None: - max_gapfilling = best_score - for media in media_scores: - score = media_scores[media] - logger.debug(score, best_score, max_gapfilling) - if score <= max_gapfilling and score <= ( - best_score + self.gapfilling_delta - ): + atp_att["selected_media"][media.id] = 0 + elif media in self.forced_media: self.selected_media.append(media) + atp_att["selected_media"][media.id] = 0 + + + self.modelutl.save_attributes(atp_att, "ATP_analysis") def apply_growth_media_gapfilling(self): """ Applies the gapfilling to all selected growth media :return: """ + self.cumulative_core_gapfilling = [] + # TODO: In case someone runs ATP correction twice with different parameters, + # before resetting this, maybe check if any of these reactions are already in + # the model and remove them so we're starting fresh??? 
for media in self.selected_media: - if media in self.media_gapfill_stats and self.media_gapfill_stats[media]: - self.model = self.msgapfill.integrate_gapfill_solution( - self.media_gapfill_stats[media] + stats = self.media_gapfill_stats.get(media, None) + if ( + stats is not None + and MSGapfill.gapfill_count(self.media_gapfill_stats[media]) > 0 + ): + self.get_msgapfill().integrate_gapfill_solution( + stats, self.cumulative_core_gapfilling, check_for_growth=False ) + # Adding reactions to gapfilling sensitivity structure so we can track all gapfilled reactions + gf_sensitivity = self.modelutl.get_attributes("gf_sensitivity", {}) + if media.id not in gf_sensitivity: + gf_sensitivity[media.id] = {} + if self.atp_hydrolysis.id not in gf_sensitivity[media.id]: + gf_sensitivity[media.id][self.atp_hydrolysis.id] = {} + gf_sensitivity[media.id][self.atp_hydrolysis.id]["success"] = {} + for item in stats["new"]: + gf_sensitivity[media.id][self.atp_hydrolysis.id]["success"][ + item + ] = {stats["new"][item]: []} + for item in stats["reversed"]: + gf_sensitivity[media.id][self.atp_hydrolysis.id]["success"][ + item + ] = {stats["reversed"][item]: []} + self.modelutl.save_attributes(gf_sensitivity, "gf_sensitivity") + self.modelutl.save_attributes( + len(self.cumulative_core_gapfilling), "total_core_gapfilling" + ) def expand_model_to_genome_scale(self): """Restores noncore reactions to model while filtering out reactions that break ATP @@ -312,11 +454,11 @@ def expand_model_to_genome_scale(self): self.restore_noncore_reactions(noncore=True, othercompartment=False) # Extending model with non core reactions while retaining ATP accuracy self.filtered_noncore = self.modelutl.reaction_expansion_test( - self.noncore_reactions, tests + self.noncore_reactions, tests, attribute_label="atp_expansion_filter" ) # Removing filtered reactions for item in self.filtered_noncore: - print("Removing " + item[0].id + " " + item[1]) + logger.info("Removing " + item[0].id + " " + item[1]) if item[1] == ">": item[0].upper_bound = 0 else: @@ -326,6 +468,16 @@ def expand_model_to_genome_scale(self): self.model.remove_reactions([item[0]]) # Restoring other compartment reactions but not the core because this would undo reaction filtering self.restore_noncore_reactions(noncore=False, othercompartment=True) + # Setting core model attribute in model + core_reactions = [] + for reaction in self.model.reactions: + # check if reaction is in core template + template_reaction = self.find_reaction_in_template( + reaction, self.coretemplate, self.compartment[0:1] + ) + if template_reaction is not None: + core_reactions.append(reaction.id) + self.modelutl.save_attributes(core_reactions, "core_reactions") def restore_noncore_reactions(self, noncore=True, othercompartment=True): """ @@ -352,7 +504,7 @@ def restore_noncore_reactions(self, noncore=True, othercompartment=True): reaction.lower_bound = self.original_bounds[reaction.id][0] reaction.upper_bound = self.original_bounds[reaction.id][1] - def build_tests(self, multiplier=None): + def build_tests(self, multiplier_hash_override=None): """Build tests based on ATP media evaluations Parameters ---------- @@ -368,22 +520,66 @@ Raises ------ """ - if multiplier is None: - multiplier = self.multiplier + # Checking if ATP stats have been run yet and if not, running them + if not self.selected_media: + logger.warning("ATP tests not yet computed - running without allowing for model changes!") + self.evaluate_growth_media(no_gapfilling=True) +
self.determine_growth_media() + self.restore_noncore_reactions() + if multiplier_hash_override is None: + multiplier_hash_override = {} + # Applying threshold multiplier + for key in default_threshold_multipliers: + if key not in multiplier_hash_override: + multiplier_hash_override[key] = default_threshold_multipliers[key] + # Initializing ATP test attributes + atp_att = self.modelutl.get_attributes( + "ATP_analysis", + {"tests": {}, "selected_media": {}, "core_atp_gapfilling": {}}, + ) + # Initializing tests and adding empty media every time tests = [] + if "empty" in self.media_hash: + tests.append( + { + "media": self.media_hash["empty"], + "is_max_threshold": True, + "threshold": 0.00001, + "objective": self.atp_hydrolysis.id, + } + ) + atp_att["tests"]["empty"] = { + "threshold": 0.00001, + "objective": self.atp_hydrolysis.id, + } + # Setting objective to ATP hydrolysis self.model.objective = self.atp_hydrolysis.id for media in self.selected_media: + # Setting multiplier for test threshold + multiplier = multiplier_hash_override["default"] + if media.id in multiplier_hash_override: + multiplier = multiplier_hash_override[media.id] + # Constraining model exchanges for media self.modelutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(media) + # Computing core ATP production obj_value = self.model.slim_optimize() - logger.debug(f"{media.name} = {obj_value}") + logger.debug(f"{media.name} = {obj_value};{multiplier}") + threshold = multiplier * obj_value + if threshold == 0: + threshold += 0.00001 tests.append( { "media": media, "is_max_threshold": True, - "threshold": multiplier * obj_value, + "threshold": threshold, "objective": self.atp_hydrolysis.id, } ) + atp_att["selected_media"][media.id] = obj_value + atp_att["tests"][media.id] = { + "threshold": multiplier * obj_value, + "objective": self.atp_hydrolysis.id, + } + # Saving test attributes to the model + self.modelutl.save_attributes(atp_att, "ATP_analysis") return tests def run_atp_correction(self): @@ -395,7 +591,7 @@ def run_atp_correction(self): self.evaluate_growth_media() self.determine_growth_media() self.apply_growth_media_gapfilling() - self.evaluate_growth_media() + # self.evaluate_growth_media() self.expand_model_to_genome_scale() return self.build_tests() diff --git a/modelseedpy/core/msbuilder.py b/modelseedpy/core/msbuilder.py index 8a65ff70..3d4c1920 100644 --- a/modelseedpy/core/msbuilder.py +++ b/modelseedpy/core/msbuilder.py @@ -1,64 +1,84 @@ # -*- coding: utf-8 -*- import logging import itertools +from enum import Enum import cobra +from modelseedpy.core.exceptions import ModelSEEDError from modelseedpy.core.rast_client import RastClient from modelseedpy.core.msgenome import normalize_role +from modelseedpy.core.mstemplate import TemplateReactionType from modelseedpy.core.msmodel import ( get_gpr_string, get_reaction_constraints_from_direction, ) -from cobra.core import Gene, Metabolite, Model, Reaction +from cobra.core import Gene, Metabolite, Model, Reaction, Group from modelseedpy.core import FBAHelper +from modelseedpy.core.msmodel import MSModel from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.biochem.modelseed_biochem import ModelSEEDBiochem +from modelseedpy.biochem.modelseed_to_cobra import modelseed_to_cobra_reaction SBO_ANNOTATION = "sbo" +DEFAULT_SINKS = { + "cpd02701_c": 1000, # S-Adenosyl-4-methylthio-2-oxobutanoate + "cpd11416_c": 1000, # Biomass + "cpd15302_c": 1000, # glycogen(n-1) + "cpd03091_c": 1000, # 5'-Deoxyadenosine + "cpd01042_c": 1000, # p-Cresol +} + logger = logging.getLogger(__name__) ### temp stuff ### core_biomass = { -
"cpd00032_c0": -1.7867, - "cpd00005_c0": -1.8225, - "cpd00169_c0": -1.496, - "cpd11416_c0": 1, - "cpd00003_c0": -3.547, - "cpd00008_c0": 41.257, - "cpd00024_c0": -1.0789, - "cpd00009_c0": 41.257, - "cpd00102_c0": -0.129, - "cpd00101_c0": -0.8977, - "cpd00236_c0": -0.8977, - "cpd00002_c0": -41.257, - "cpd00022_c0": -3.7478, - "cpd00020_c0": -2.8328, - "cpd00006_c0": 1.8225, - "cpd00001_c0": -41.257, - "cpd00072_c0": -0.0709, - "cpd00010_c0": 3.7478, - "cpd00004_c0": 3.547, - "cpd00061_c0": -0.5191, - "cpd00067_c0": 46.6265, - "cpd00079_c0": -0.205, + "cpd00032_c": -1.7867, + "cpd00005_c": -1.8225, + "cpd00169_c": -1.496, + "cpd11416_c": 1, + "cpd00003_c": -3.547, + "cpd00008_c": 41.257, + "cpd00024_c": -1.0789, + "cpd00009_c": 41.257, + "cpd00102_c": -0.129, + "cpd00101_c": -0.8977, + "cpd00236_c": -0.8977, + "cpd00002_c": -41.257, + "cpd00022_c": -3.7478, + "cpd00020_c": -2.8328, + "cpd00006_c": 1.8225, + "cpd00001_c": -41.257, + "cpd00072_c": -0.0709, + "cpd00010_c": 3.7478, + "cpd00004_c": 3.547, + "cpd00061_c": -0.5191, + "cpd00067_c": 46.6265, + "cpd00079_c": -0.205, } core_atp2 = { - "cpd00067_c0": 46.6265, - "cpd00002_c0": -41.257, - "cpd00008_c0": 41.257, - "cpd00001_c0": -41.257, - "cpd00009_c0": 41.257, + "cpd00067_c": 46.6265, + "cpd00002_c": -41.257, + "cpd00008_c": 41.257, + "cpd00001_c": -41.257, + "cpd00009_c": 41.257, } core_atp = { - "cpd00067_c0": 1, - "cpd00002_c0": -1, - "cpd00008_c0": 1, - "cpd00001_c0": -1, - "cpd00009_c0": 1, + "cpd00067_c": 1, + "cpd00002_c": -1, + "cpd00008_c": 1, + "cpd00001_c": -1, + "cpd00009_c": 1, } gramneg = { + "cpd11463_c0": 1, + "cpd00008_c0": 40, + "cpd00067_c0": 40, + "cpd00009_c0": 40, + "cpd00001_c0": -40, + "cpd00002_c0": -40, "cpd00166_c0": -0.00280615915959131, "cpd00087_c0": -0.00280615915959131, "cpd15560_c0": -0.00280615915959131, @@ -70,162 +90,100 @@ "cpd00220_c0": -0.00280615915959131, "cpd00003_c0": -0.00280615915959131, "cpd00557_c0": -0.00280615915959131, - "cpd00002_c0": -40.1101757365074, - "cpd00023_c0": -0.219088153012743, - "cpd00062_c0": -0.0908319049068452, "cpd00050_c0": -0.00280615915959131, - "cpd00008_c0": 40, "cpd00264_c0": -0.00280615915959131, "cpd00010_c0": -0.00280615915959131, "cpd15533_c0": -0.0311453449430676, - "cpd11416_c0": 1, "cpd15540_c0": -0.0311453449430676, "cpd00048_c0": -0.00280615915959131, - "cpd00035_c0": -0.427934380173264, - "cpd17042_c0": -1, "cpd00030_c0": -0.00280615915959131, "cpd00034_c0": -0.00280615915959131, - "cpd00161_c0": -0.211072732780569, "cpd00201_c0": -0.00280615915959131, "cpd00016_c0": -0.00280615915959131, "cpd00104_c0": -0.00280615915959131, - "cpd00067_c0": 40, "cpd11493_c0": -0.00280615915959131, - "cpd00051_c0": -0.246696822701341, "cpd00017_c0": -0.00280615915959131, - "cpd00357_c0": -0.0157642107352084, - "cpd17041_c0": -1, - "cpd00038_c0": -0.135406821203723, - "cpd00107_c0": -0.375388847540127, "cpd00042_c0": -0.00280615915959131, "cpd00149_c0": -0.00280615915959131, "cpd00058_c0": -0.00280615915959131, - "cpd00041_c0": -0.200830806928348, - "cpd00129_c0": -0.184354665339991, - "cpd15432_c0": -0.0250105977108944, - "cpd00052_c0": -0.0841036156544863, - "cpd00012_c0": 0.484600235732628, + "cpd03736_c0": -0.0250105977108944, "cpd15352_c0": -0.00280615915959131, - "cpd00322_c0": -0.241798510337235, - "cpd00053_c0": -0.219088153012743, "cpd00006_c0": -0.00280615915959131, "cpd00345_c0": -0.00280615915959131, "cpd00063_c0": -0.00280615915959131, - "cpd00033_c0": -0.509869786991038, - "cpd00066_c0": -0.154519490031345, - "cpd17043_c0": -1, "cpd00118_c0": 
-0.00280615915959131, - "cpd00009_c0": 39.9971938408404, "cpd15793_c0": -0.0311453449430676, - "cpd00356_c0": -0.01627686799489, "cpd01997_c0": 0.00280615915959131, - "cpd00132_c0": -0.200830806928348, - "cpd00060_c0": -0.127801422590767, "cpd00037_c0": -0.00280615915959131, - "cpd00115_c0": -0.0157642107352084, "cpd00099_c0": -0.00280615915959131, - "cpd00156_c0": -0.352233189091625, "cpd02229_c0": -0.0250105977108944, - "cpd00069_c0": -0.120676604606612, - "cpd00065_c0": -0.0472019191450218, - "cpd00241_c0": -0.01627686799489, "cpd15666_c0": 0.0250105977108944, "cpd10516_c0": -0.00280615915959131, - "cpd00084_c0": -0.0761464922056484, "cpd00056_c0": -0.00280615915959131, - "cpd00119_c0": -0.0792636000737159, - "cpd00001_c0": -35.5403092430435, "cpd03422_c0": 0.00280615915959131, "cpd00015_c0": -0.00280615915959131, - "cpd00054_c0": -0.179456352975885, "cpd00205_c0": -0.00280615915959131, - "cpd00039_c0": -0.285438020490179, "cpd00254_c0": -0.00280615915959131, + "cpd11463_c0": -0.5, + "cpd11461_c0": -0.1, + "cpd11462_c0": -0.2, } grampos = { - "cpd00241_c0": -0.0116907079028565, + "cpd11416_c0": 1, + "cpd00001_c0": -40, + "cpd00009_c0": 40, + "cpd00008_c0": 40, + "cpd00002_c0": -40, + "cpd00067_c0": 40, "cpd00017_c0": -0.00719527989638797, - "cpd00033_c0": -0.409331301687739, - "cpd00066_c0": -0.176188648374102, - "cpd17043_c0": -1, "cpd03422_c0": 0.00719527989638797, - "cpd17041_c0": -1, "cpd00557_c0": -0.00719527989638797, - "cpd00129_c0": -0.161028229793075, "cpd00166_c0": -0.00719527989638797, "cpd00030_c0": -0.00719527989638797, "cpd00087_c0": -0.00719527989638797, "cpd00015_c0": -0.00719527989638797, - "cpd00065_c0": -0.0544955586831525, - "cpd00357_c0": -0.0151844826784228, - "cpd00009_c0": 41.2498047201036, "cpd00038_c0": -0.0424026391792249, "cpd15667_c0": -0.00309563020839783, - "cpd00069_c0": -0.111039822579957, "cpd15540_c0": -0.0251172136637642, - "cpd00161_c0": -0.186841915485094, "cpd15748_c0": -0.00309563020839783, - "cpd00035_c0": -0.267560900902997, "cpd00048_c0": -0.00719527989638797, "cpd12370_c0": 0.00719527989638797, "cpd00052_c0": -0.0261242266150642, "cpd15757_c0": -0.00309563020839783, - "cpd00053_c0": -0.261005044219309, "cpd15533_c0": -0.0251172136637642, - "cpd00002_c0": -41.2913947104178, "cpd00006_c0": -0.00719527989638797, - "cpd00084_c0": -0.0569540049395353, "cpd10515_c0": -0.00719527989638797, "cpd00104_c0": -0.00719527989638797, - "cpd00051_c0": -0.193397772168782, "cpd00028_c0": -0.00719527989638797, "cpd00118_c0": -0.00719527989638797, - "cpd00107_c0": -0.347460404235438, "cpd00037_c0": -0.00719527989638797, "cpd15793_c0": -0.0251172136637642, "cpd00010_c0": -0.00719527989638797, "cpd11493_c0": -0.00719527989638797, "cpd00264_c0": -0.00719527989638797, "cpd15766_c0": -0.00309563020839783, - "cpd00041_c0": -0.14832625746843, "cpd00056_c0": -0.00719527989638797, "cpd01997_c0": 0.00719527989638797, "cpd15668_c0": -0.00309563020839783, "cpd00254_c0": -0.00719527989638797, - "cpd11416_c0": 1, "cpd02229_c0": -0.00309563020839783, "cpd00003_c0": -0.00719527989638797, - "cpd00008_c0": 41.257, - "cpd17042_c0": -1, - "cpd00023_c0": -0.261005044219309, "cpd15665_c0": -0.00309563020839783, "cpd11459_c0": -0.00309563020839783, "cpd15666_c0": 0.0123825208335913, - "cpd00115_c0": -0.0151844826784228, "cpd00050_c0": -0.00719527989638797, "cpd00063_c0": -0.00719527989638797, "cpd00205_c0": -0.00719527989638797, - "cpd00054_c0": -0.216753011604418, "cpd00042_c0": -0.00719527989638797, "cpd00034_c0": -0.00719527989638797, "cpd15500_c0": -0.00719527989638797, - 
"cpd00156_c0": -0.307715523090583, - "cpd00132_c0": -0.14832625746843, - "cpd00067_c0": -41.257, "cpd15775_c0": -0.00309563020839783, - "cpd00119_c0": -0.0819482085460939, - "cpd00060_c0": -0.11349826883634, - "cpd00001_c0": 45.354000686262, "cpd00099_c0": -0.00719527989638797, - "cpd00356_c0": -0.0116907079028565, "cpd00220_c0": -0.00719527989638797, - "cpd00322_c0": -0.27042908820211, "cpd00062_c0": -0.0282246669459237, "cpd00345_c0": -0.00719527989638797, - "cpd00012_c0": 0.184896624320595, "cpd10516_c0": -0.00719527989638797, - "cpd00039_c0": -0.323695423757071, "cpd00201_c0": -0.00719527989638797, "cpd15669_c0": -0.00309563020839783, "cpd15560_c0": -0.00719527989638797, @@ -233,9 +191,19 @@ "cpd00058_c0": -0.00719527989638797, "cpd00016_c0": -0.00719527989638797, "cpd15352_c0": -0.00719527989638797, + "cpd11463_c0": -0.5, + "cpd11461_c0": -0.1, + "cpd11462_c0": -0.2, } +class MSGenomeClass(Enum): + P = "Gram Positive" + N = "Gram Negative" + C = "Cyano" + A = "Archaea" + + def build_biomass(rxn_id, cobra_model, template, biomass_compounds, index="0"): bio_rxn = Reaction(rxn_id, "biomass", "", 0, 1000) metabolites = {} @@ -326,13 +294,143 @@ def build_gpr(cpx_gene_role): class MSBuilder: - def __init__(self, genome, template=None): + def __init__( + self, genome, template=None, name=None, ontology_term="RAST", index="0" + ): """ - for future methods with better customization + + @param genome: MSGenome + @param template: MSTemplate + @param name: + @param ontology_term: """ + if index is None or type(index) != str: + raise TypeError("index must be str") + if ontology_term is None or type(ontology_term) != str: + raise TypeError("ontology_term must be str") + self.name = name self.genome = genome self.template = template - self.search_name_to_genes, self.search_name_to_original = _aaaa(genome, "RAST") + self.genome_class = None + self.search_name_to_genes, self.search_name_to_original = _aaaa( + genome, ontology_term + ) + self.template_species_to_model_species = None + self.reaction_to_complex_sets = None + self.compartments = None + self.base_model = None + self.compartments_index = None # TODO: implement custom index by compartment + self.index = index + + def build_drains(self): + if self.template_species_to_model_species is None: + logger.warning("cannot build model drains without generating model species") + return None + if self.template.drains: + sinks = self.build_sinks() + demands = self.build_demands() + return sinks + demands + else: + # template without drain specification we build only default sinks + return self.build_sinks() + + def build_sinks(self): + if self.template_species_to_model_species is None: + logger.warning("cannot build model sinks without generating model species") + return None + if self.template.drains: + sinks = { + x.id: t[1] + for x, t in self.template.drains.items() + if t[1] > 0 and x.id in self.template_species_to_model_species + } + return [self.build_sink_reaction(x, v) for x, v in sinks.items()] + else: + # template without drain specification we build only default sinks + in_model = { + k: v + for k, v in DEFAULT_SINKS.items() + if k in self.template_species_to_model_species + } + return [self.build_sink_reaction(x, v) for x, v in in_model.items()] + + def build_demands(self): + if self.template_species_to_model_species is None: + logger.warning("cannot build model sinks without generating model species") + return None + if self.template.drains: + demands = { + x.id: t[0] + for x, t in self.template.drains.items() + if t[0] < 0 and x.id in 
self.template_species_to_model_species + } + return [self.build_demand_reaction(x, v) for x, v in demands.items()] + else: + return [] + + def build_drain_reaction( + self, + template_cpd_id, + prefix="EX_", + name_prefix="Exchange for ", + subsystem="exchanges", + lower_bound=0, + upper_bound=1000, + sbo_term="SBO:0000627", + ): + """ + SK_ for sink (SBO_0000632) DM_ for demand (SBO_0000628) EX_ for exchange (SBO_0000627) + @param template_cpd_id: + @param prefix: + @param name_prefix: + @param subsystem: + @param lower_bound: + @param upper_bound: + @param sbo_term: + @return: + """ + + if self.template_species_to_model_species is None: + logger.warning("cannot build model drains without generating model species") + return None + else: + m = self.template_species_to_model_species[template_cpd_id] + drain = Reaction( + f"{prefix}{m.id}", + f"{name_prefix}{m.name}", + subsystem, + lower_bound, + upper_bound, + ) + drain.add_metabolites({m: -1}) + drain.annotation[SBO_ANNOTATION] = sbo_term + return drain + + def build_sink_reaction(self, template_cpd_id, upper_bound): + if upper_bound <= 0: + raise ModelSEEDError("Sink reactions must have upper bound > 0") + return self.build_drain_reaction( + template_cpd_id, + "SK_", + "Sink for ", + "exchanges", + 0, + upper_bound, + "SBO:0000632", + ) + + def build_demand_reaction(self, template_cpd_id, lower_bound): + if lower_bound >= 0: + raise ModelSEEDError("Demand reactions must have lower bound < 0") + return self.build_drain_reaction( + template_cpd_id, + "DM_", + "Demand for ", + "exchanges", + lower_bound, + 0, + "SBO:0000628", + ) def _get_template_reaction_complexes(self, template_reaction): """ @@ -434,40 +532,7 @@ def get_gpr_from_template_reaction( return gpr_set @staticmethod - def _build_reaction(reaction_id, gpr_set, template, index="0", sbo=None): - template_reaction = template.reactions.get_by_id(reaction_id) - - reaction_compartment = template_reaction.compartment - metabolites = {} - - for cpd, value in template_reaction.metabolites.items(): - compartment = f"{cpd.compartment}{index}" - name = f"{cpd.name}_{compartment}" - cpd = Metabolite( - cpd.id + str(index), cpd.formula, name, cpd.charge, compartment - ) - metabolites[cpd] = value - - reaction = Reaction( - "{}{}".format(template_reaction.id, index), - "{}_{}{}".format(template_reaction.name, reaction_compartment, index), - "", - template_reaction.lower_bound, - template_reaction.upper_bound, - ) - - gpr_str = build_gpr2(gpr_set) if gpr_set else "" - reaction.add_metabolites(metabolites) - if gpr_str and len(gpr_str) > 0: - reaction.gene_reaction_rule = gpr_str # get_gpr_string(gpr_ll) - - reaction.annotation["seed.reaction"] = template_reaction.reference_id - if sbo: - reaction.annotation[SBO_ANNOTATION] = sbo - return reaction - - @staticmethod - def build_exchanges(model, extra_cell="e0"): + def add_exchanges_to_model(model, extra_cell="e0"): """ Build exchange reactions for the "extra_cell" compartment :param model: Cobra Model @@ -494,15 +559,25 @@ def build_exchanges(model, extra_cell="e0"): return reactions_exchanges @staticmethod - def build_biomasses(model, template, index): + def get_or_create_metabolite( + model, template, cpd_base_id, compartment="c", index=0 + ): + if isinstance(index, int): + index = str(index) + full_id = cpd_base_id + "_" + compartment + index + if full_id not in model.metabolites: + pass + return model.metabolites.get_by_id(full_id) + + def build_static_biomasses(self, model, template): res = [] if template.name.startswith("CoreModel"): - 
res.append(build_biomass("bio1", model, template, core_biomass, index)) - res.append(build_biomass("bio2", model, template, core_atp, index)) + res.append(self.build_biomass("bio1", model, template, core_biomass)) + res.append(self.build_biomass("bio2", model, template, core_atp)) if template.name.startswith("GramNeg"): - res.append(build_biomass("bio1", model, template, gramneg, index)) + res.append(self.build_biomass("bio1", model, template, gramneg)) if template.name.startswith("GramPos"): - res.append(build_biomass("bio1", model, template, grampos, index)) + res.append(self.build_biomass("bio1", model, template, grampos)) return res def auto_select_template(self): @@ -513,9 +588,10 @@ def auto_select_template(self): from modelseedpy.helpers import get_template, get_classifier from modelseedpy.core.mstemplate import MSTemplateBuilder - genome_classifier = get_classifier("knn_ACNP_RAST_filter") - genome_class = genome_classifier.classify(self.genome) + genome_classifier = get_classifier("knn_ACNP_RAST_filter_01_17_2023") + self.genome_class = genome_classifier.classify(self.genome) + # TODO: update with enum MSGenomeClass template_genome_scale_map = { "A": "template_gram_neg", "C": "template_gram_neg", @@ -530,68 +606,351 @@ def auto_select_template(self): } if ( - genome_class in template_genome_scale_map - and genome_class in template_core_map + self.genome_class in template_genome_scale_map + and self.genome_class in template_core_map ): self.template = MSTemplateBuilder.from_dict( - get_template(template_genome_scale_map[genome_class]) + get_template(template_genome_scale_map[self.genome_class]) ).build() elif self.template is None: - raise Exception(f"unable to select template for {genome_class}") + raise Exception(f"unable to select template for {self.genome_class}") - return genome_class + return self.genome_class - def build_metabolic_reactions(self, index="0", allow_incomplete_complexes=True): - metabolic_reactions = {} + def generate_reaction_complex_sets(self, allow_incomplete_complexes=True): + self.reaction_to_complex_sets = {} for template_reaction in self.template.reactions: gpr_set = self.get_gpr_from_template_reaction( template_reaction, allow_incomplete_complexes ) if gpr_set: - metabolic_reactions[template_reaction.id] = gpr_set + self.reaction_to_complex_sets[template_reaction.id] = gpr_set logger.debug("[%s] gpr set: %s", template_reaction.id, gpr_set) - reactions = list( - map( - lambda x: self._build_reaction( - x[0], x[1], self.template, index, "SBO:0000176" - ), - metabolic_reactions.items(), + return self.reaction_to_complex_sets + + """ + def _build_reaction(self, reaction_id, gpr_set, template, index="0", sbo=None): + template_reaction = template.reactions.get_by_id(reaction_id) + + reaction_compartment = template_reaction.compartment + metabolites = {} + + for cpd, value in template_reaction.metabolites.items(): + compartment = f"{cpd.compartment}{index}" + name = f"{cpd.name}_{compartment}" + cpd = Metabolite( + cpd.id + str(index), cpd.formula, name, cpd.charge, compartment ) + metabolites[cpd] = value + + reaction = Reaction( + "{}{}".format(template_reaction.id, index), + "{}_{}{}".format(template_reaction.name, reaction_compartment, index), + "", + template_reaction.lower_bound, + template_reaction.upper_bound, ) + gpr_str = build_gpr2(gpr_set) if gpr_set else "" + reaction.add_metabolites(metabolites) + if gpr_str and len(gpr_str) > 0: + reaction.gene_reaction_rule = gpr_str # get_gpr_string(gpr_ll) + + reaction.annotation["seed.reaction"] = 
template_reaction.reference_id + if sbo: + reaction.annotation[SBO_ANNOTATION] = sbo + return reaction + """ + + def build_complex_groups(self, complex_sets): + """ + Builds complex Group from complex sets computed from template and genome + Example: {'cpx00700': {'ftr01608': {'b3177'}}, 'cpx01370': {'ftr01607': {'b0142'}}} + @param complex_sets: + @return: + """ + group_complexes = {} + for complex_set in complex_sets: + for complex_id in complex_set: + if ( + complex_id not in group_complexes + and complex_id in self.template.complexes + ): + cpx = self.template.complexes.get_by_id(complex_id) + g = Group(complex_id) + g.notes["complex_source"] = cpx.source + for role, (t, o) in cpx.roles.items(): + if role.id in complex_set[complex_id]: + g.notes[f"complex_subunit_note_{role.id}"] = role.name + g.notes[f"complex_subunit_optional_{role.id}"] = ( + 1 if o else 0 + ) + g.notes[f"complex_subunit_triggering_{role.id}"] = ( + 1 if t else 0 + ) + g.notes[f"complex_subunit_features_{role.id}"] = ";".join( + sorted(list(complex_set[complex_id][role.id])) + ) + group_complexes[g.id] = g + + return group_complexes + + def build_metabolic_reactions(self): + if self.base_model is None: + raise ModelSEEDError( + "unable to generate metabolic reactions without base model" + ) + if self.reaction_to_complex_sets is None: + raise ModelSEEDError( + "unable to generate metabolic reactions without generate complex sets" + ) + + if self.template_species_to_model_species is None: + self.template_species_to_model_species = {} + if self.compartments is None: + self.compartments = {} + + reactions = [] + for rxn_id, complex_set in self.reaction_to_complex_sets.items(): + template_reaction = self.template.reactions.get_by_id(rxn_id) + for m in template_reaction.metabolites: + if m.compartment not in self.compartments: + self.compartments[ + m.compartment + ] = self.template.compartments.get_by_id(m.compartment) + if m.id not in self.template_species_to_model_species: + model_metabolite = m.to_metabolite(self.index) + self.template_species_to_model_species[m.id] = model_metabolite + self.base_model.add_metabolites([model_metabolite]) + reaction = template_reaction.to_reaction(self.base_model, self.index) + gpr_str = build_gpr2(complex_set) if complex_set else "" + if gpr_str and len(gpr_str) > 0: + reaction.gene_reaction_rule = gpr_str + reaction.annotation[SBO_ANNOTATION] = "SBO:0000176" + reaction.notes["modelseed_complex"] = ";".join(sorted(list(complex_set))) + reactions.append(reaction) + return reactions + def build_from_annotaton_ontology( + self, + model_or_id, + anno_ont, + index="0", + allow_all_non_grp_reactions=False, + annotate_with_rast=False, + biomass_classic=False, + biomass_gc=0.5, + add_non_template_reactions=True, + prioritized_event_list=None, + ontologies=None, + merge_all=True, + convert_to_sso=True, + ): + # Build base model without annotation + self.search_name_to_orginal = {} + self.search_name_to_genes = {} + gene_term_hash = anno_ont.get_gene_term_hash( + prioritized_event_list, ontologies, merge_all, convert_to_sso + ) + residual_reaction_gene_hash = {} + for gene in gene_term_hash: + for term in gene_term_hash[gene]: + if term.ontology.id == "SSO": + name = anno_ont.get_term_name(term) + f_norm = normalize_role(name) + if f_norm not in self.search_name_to_genes: + self.search_name_to_genes[f_norm] = set() + self.search_name_to_orginal[f_norm] = set() + self.search_name_to_orginal[f_norm].add(name) + self.search_name_to_genes[f_norm].add(gene.id) + else: + for rxn_id in term.msrxns: + 
if rxn_id not in residual_reaction_gene_hash: + residual_reaction_gene_hash[rxn_id] = {} + if gene not in residual_reaction_gene_hash[rxn_id]: + residual_reaction_gene_hash[rxn_id][gene] = [] + residual_reaction_gene_hash[rxn_id][gene] = gene_term_hash[ + gene + ][term] + + model_or_id = self.build( + model_or_id, + index, + allow_all_non_grp_reactions, + annotate_with_rast, + biomass_classic, + biomass_gc, + ) + for rxn in model_or_id.reactions: + probability = None + for gene in rxn.genes(): + annoont_gene = anno_ont.get_feature(gene.id) + if annoont_gene and annoont_gene in gene_term_hash: + for term in gene_term_hash[annoont_gene]: + if rxn.id[0:-3] in term.msrxns: + for item in gene_term_hash[gene][term]: + if "probability" in item.scores: + if ( + not probability + or item.scores["probability"] > probability + ): + probability = item.scores["probability"] + if hasattr(rxn, "probability"): + rxn.probability = probability + + reactions = [] + modelseeddb = ModelSEEDBiochem.get() + for rxn_id in residual_reaction_gene_hash: + if rxn_id + "_c0" not in model_or_id.reactions: + reaction = None + template_reaction = None + if rxn_id + "_c" in self.template.reactions: + template_reaction = self.template.reactions.get_by_id(rxn_id + "_c") + elif rxn_id in modelseeddb.reactions: + msrxn = modelseeddb.reactions.get_by_id(rxn_id) + template_reaction = msrxn.to_template_reaction({0: "c", 1: "e"}) + if template_reaction: + for m in template_reaction.metabolites: + if m.compartment not in self.compartments: + self.compartments[ + m.compartment + ] = self.template.compartments.get_by_id(m.compartment) + if m.id not in self.template_species_to_model_species: + model_metabolite = m.to_metabolite(self.index) + self.template_species_to_model_species[ + m.id + ] = model_metabolite + self.base_model.add_metabolites([model_metabolite]) + reaction = template_reaction.to_reaction( + self.base_model, self.index + ) + gpr = "" + probability = None + for gene in residual_reaction_gene_hash[rxn_id]: + for item in residual_reaction_gene_hash[rxn_id][gene]: + if "probability" in item["scores"]: + if ( + not probability + or item["scores"]["probability"] > probability + ): + probability = item["scores"]["probability"] + if len(gpr) > 0: + gpr += " or " + gpr += gene.id + if hasattr(rxn, "probability"): + reaction.probability = probability + reaction.gene_reaction_rule = gpr + reaction.annotation[SBO_ANNOTATION] = "SBO:0000176" + reactions.append(reaction) + if not reaction: + print("Reaction ", rxn_id, " not found in template or database!") + + model_or_id.add_reactions(reactions) + return model_or_id + def build_non_metabolite_reactions( - self, cobra_model, index="0", allow_all_non_grp_reactions=False + self, cobra_model, allow_all_non_grp_reactions=False ): - reactions_no_gpr = [] - reactions_in_model = set(map(lambda x: x.id, cobra_model.reactions)) - metabolites_in_model = set(map(lambda x: x.id, cobra_model.metabolites)) - for rxn in self.template.reactions: - if rxn.type == "universal" or rxn.type == "spontaneous": - reaction = self._build_reaction( - rxn.id, {}, self.template, index, "SBO:0000176" - ) - reaction_metabolite_ids = set( - map(lambda x: x.id, set(reaction.metabolites)) - ) + if self.base_model is None: + raise ModelSEEDError( + "unable to generate metabolic reactions without base model" + ) + if self.reaction_to_complex_sets is None: + raise ModelSEEDError( + "unable to generate metabolic reactions without generate complex sets" + ) + + if self.template_species_to_model_species is None: + 
self.template_species_to_model_species = {} + if self.compartments is None: + self.compartments = {} + + reactions = [] + for template_reaction in self.template.reactions: + rxn_type = template_reaction.type + if ( + rxn_type == "universal" + or rxn_type == "spontaneous" + or rxn_type == TemplateReactionType.UNIVERSAL + or rxn_type == TemplateReactionType.SPONTANEOUS + ): + reaction_metabolite_ids = {m.id for m in template_reaction.metabolites} if ( - len(metabolites_in_model & reaction_metabolite_ids) > 0 + len( + set(self.template_species_to_model_species) + & reaction_metabolite_ids + ) + > 0 or allow_all_non_grp_reactions - ) and reaction.id not in reactions_in_model: - reaction.annotation["seed.reaction"] = rxn.id - reactions_no_gpr.append(reaction) + ): + for m in template_reaction.metabolites: + if m.compartment not in self.compartments: + self.compartments[ + m.compartment + ] = self.template.compartments.get_by_id(m.compartment) + if m.id not in self.template_species_to_model_species: + model_metabolite = m.to_metabolite(self.index) + self.template_species_to_model_species[ + m.id + ] = model_metabolite + self.base_model.add_metabolites([model_metabolite]) + + reaction = template_reaction.to_reaction( + self.base_model, self.index + ) + reaction.annotation[SBO_ANNOTATION] = "SBO:0000672" + # if template_reaction.type == "spontaneous": + # reaction.annotation[SBO_ANNOTATION] = "SBO:0000176" - return reactions_no_gpr + if reaction.id not in cobra_model.reactions: + reactions.append(reaction) + + return reactions + + def build_biomass(self, rxn_id, cobra_model, template, biomass_compounds): + bio_rxn = Reaction(rxn_id, "biomass", "", 0, 1000) + metabolites = {} + for template_cpd_id in biomass_compounds: + if template_cpd_id in self.template_species_to_model_species: + model_species_id = self.template_species_to_model_species[ + template_cpd_id + ].id + cpd = cobra_model.metabolites.get_by_id(model_species_id) + metabolites[cpd] = biomass_compounds[template_cpd_id] + else: + template_cpd = template.compcompounds.get_by_id(template_cpd_id[:-1]) + m = template_cpd.to_metabolite(self.index) + metabolites[m] = biomass_compounds[template_cpd_id] + self.template_species_to_model_species[template_cpd_id] = m + cobra_model.add_metabolites([m]) + bio_rxn.add_metabolites(metabolites) + bio_rxn.annotation[SBO_ANNOTATION] = "SBO:0000629" + return bio_rxn def build( self, - model_id, + model_or_id, index="0", allow_all_non_grp_reactions=False, annotate_with_rast=True, + biomass_classic=False, + biomass_gc=0.5, + add_reaction_from_rast_annotation=True, ): + """ + + @param model_or_id: a string ID to build from cobra.core.Model otherwise a type of cobra.core.Model + as Base Model + @param index: + @param allow_all_non_grp_reactions: + @param annotate_with_rast: + @param biomass_classic: + @param biomass_gc: + @return: + """ + self.index = index if annotate_with_rast: rast = RastClient() @@ -604,28 +963,83 @@ def build( if self.template is None: self.auto_select_template() - cobra_model = Model(model_id) - cobra_model.add_reactions(self.build_metabolic_reactions(index=index)) - cobra_model.add_reactions( - self.build_non_metabolite_reactions( - cobra_model, index, allow_all_non_grp_reactions - ) + cobra_model = model_or_id + if type(model_or_id) == str: + from cobra.core import Model + + cobra_model = Model(model_or_id) + + self.base_model = cobra_model + + self.generate_reaction_complex_sets() + complex_groups = self.build_complex_groups( + self.reaction_to_complex_sets.values() ) - 
self.build_exchanges(cobra_model) + if add_reaction_from_rast_annotation: + metabolic_reactions = self.build_metabolic_reactions() + cobra_model.add_reactions(metabolic_reactions) + + non_metabolic_reactions = self.build_non_metabolite_reactions( + cobra_model, allow_all_non_grp_reactions + ) + cobra_model.add_reactions(non_metabolic_reactions) + cobra_model.add_groups(list(complex_groups.values())) + self.add_exchanges_to_model(cobra_model) + + biomass_reactions = [] + for rxn_biomass in self.template.biomasses: + reaction = rxn_biomass.build_biomass( + cobra_model, index, biomass_classic, biomass_gc + ) + for m in reaction.metabolites: + if "modelseed_template_id" in m.notes: + self.template_species_to_model_species[ + m.notes["modelseed_template_id"] + ] = m + biomass_reactions.append(reaction) + + if len(biomass_reactions) > 0: + for rxn in biomass_reactions: + if rxn.id not in cobra_model.reactions: + cobra_model.add_reactions([rxn]) + cobra_model.objective = biomass_reactions[0].id + + """ if ( self.template.name.startswith("CoreModel") or self.template.name.startswith("GramNeg") or self.template.name.startswith("GramPos") ): - cobra_model.add_reactions( - self.build_biomasses(cobra_model, self.template, index) - ) + gc = 0.5 + if hasattr(self.genome,"info"): + gc = float(self.genome.info.metadata["GC content"]) + print("Genome custom GC:",gc) + for bio in self.template.biomasses: + bio.build_biomass(cobra_model, index, classic=False, GC=gc,add_to_model=True) cobra_model.objective = "bio1" + """ - reactions_sinks = [] + reactions_sinks = self.build_drains() + cobra_model.add_reactions(reactions_sinks) + + compartment_data = {} + for cmp_id, data in self.compartments.items(): + cmp_index_id = f"{cmp_id}{self.index}" + compartment_data[cmp_index_id] = data.name + kbase_compartment_data_key = f"kbase_compartment_data_{cmp_index_id}" + kbase_compartment_data = { + "pH": data.ph, + "potential": 0, + "compartmentIndex": self.index, + } + cobra_model.notes[kbase_compartment_data_key] = kbase_compartment_data + + cobra_model.compartments = compartment_data + + """ for cpd_id in ["cpd02701_c0", "cpd11416_c0", "cpd15302_c0"]: - if cpd_id in cobra_model.metabolites: + if cpd_id in cobra_model.metabolites: m = cobra_model.metabolites.get_by_id(cpd_id) rxn_exchange = Reaction( "SK_" + m.id, "Sink for " + m.name, "exchanges", 0, 1000 @@ -633,7 +1047,7 @@ def build( rxn_exchange.add_metabolites({m: -1}) rxn_exchange.annotation[SBO_ANNOTATION] = "SBO:0000627" reactions_sinks.append(rxn_exchange) - cobra_model.add_reactions(reactions_sinks) + """ return cobra_model @@ -646,33 +1060,32 @@ def build_full_template_model(template, model_id=None, index="0"): :param index: index for the metabolites :return: """ - model = Model(model_id if model_id else template.id) + from modelseedpy.core.msmodel import MSModel + + model = MSModel(model_id if model_id else template.id, template=template) all_reactions = [] for rxn in template.reactions: - reaction = MSBuilder._build_reaction( - rxn.id, {}, template, index, "SBO:0000176" - ) + reaction = rxn.to_reaction(model, index) reaction.annotation["seed.reaction"] = rxn.id all_reactions.append(reaction) model.add_reactions(all_reactions) - model.add_reactions(MSBuilder.build_exchanges(model)) + MSBuilder.add_exchanges_to_model(model) if template.name.startswith("CoreModel"): bio_rxn1 = build_biomass("bio1", model, template, core_biomass, index) bio_rxn2 = build_biomass("bio2", model, template, core_atp, index) model.add_reactions([bio_rxn1, bio_rxn2]) model.objective 
= "bio1" - if template.name.startswith("GramNeg"): - bio_rxn1 = build_biomass("bio1", model, template, gramneg, index) - model.add_reactions([bio_rxn1]) - model.objective = "bio1" - if template.name.startswith("GramPos"): - bio_rxn1 = build_biomass("bio1", model, template, grampos, index) - model.add_reactions([bio_rxn1]) - model.objective = "bio1" + else: + for bio in template.biomasses: + bio.build_biomass( + model, index, classic=False, GC=0.5, add_to_model=True + ) + if "bio1" in model.reactions: + model.objective = "bio1" reactions_sinks = [] - for cpd_id in ["cpd02701_c0", "cpd11416_c0", "cpd15302_c0"]: + for cpd_id in ["cpd02701_c0", "cpd11416_c0", "cpd15302_c0", "cpd03091_c0"]: if cpd_id in model.metabolites: m = model.metabolites.get_by_id(cpd_id) rxn_exchange = Reaction( @@ -694,10 +1107,15 @@ def build_metabolic_model( allow_all_non_grp_reactions=False, annotate_with_rast=True, gapfill_model=True, + classic_biomass=False, ): builder = MSBuilder(genome, template) model = builder.build( - model_id, index, allow_all_non_grp_reactions, annotate_with_rast + model_id, + index, + allow_all_non_grp_reactions, + annotate_with_rast, + classic_biomass, ) # Gapfilling model if gapfill_model: diff --git a/modelseedpy/core/msensemble.py b/modelseedpy/core/msensemble.py new file mode 100755 index 00000000..2ef522b2 --- /dev/null +++ b/modelseedpy/core/msensemble.py @@ -0,0 +1,300 @@ +# -*- coding: utf-8 -*- +import logging +import re +import time +import json +import sys +import pandas as pd +import cobra +import random +from cobra.core.dictlist import DictList +from optlang.symbolics import Zero, add +from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.core.msfba import MSFBA +from modelseedpy.core.msatpcorrection import MSATPCorrection + +# from builtins import None + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + +class MSEnsemble: + @staticmethod + def from_models(models): + #Converting models to MSModelUtil + if not isinstance(model_or_mdlutl, MSModelUtil): + for (i,mdl) in enumerate(models): + models[i] = MSModelUtil.get(mdl) + #Cloning the first model as a starting point + clone_model = cobra.io.json.from_json(cobra.io.json.to_json(models[0].model)) + clone_mdlutl = MSModelUtil.get(clone_model) + ensemble = MSEnsemble(clone_mdlutl) + ensemble.rebuild_from_models(models) + + def from_annotation(model_or_mdlutl,reaction_probability_hash,sample_count=100): + #Create genome from probabilities + mdl = MSBuilder(genome,template).build(base_model, '0', False, False) + mdl.template = self.gs_template + mdlutl = MSModelUtil.get(mdl) + ensemble = MSEnsemble(mdlutl) + ensemble.build_ensemble(reaction_probability_hash, gpr_level_sampling, sample_count) + + def __init__(self,model_or_mdlutl,reaction_probabilities=None): + # Discerning input is model or mdlutl and setting internal links + if isinstance(model_or_mdlutl, MSModelUtil): + self.model = model_or_mdlutl.model + self.mdlutl = model_or_mdlutl + else: + self.model = model_or_mdlutl + self.mdlutl = MSModelUtil.get(model_or_mdlutl) + attributes = self.mdlutl.get_attributes() + if "ensemble" not in attributes: + self.data = { + "size": 0, + "reactions": {} + } + for rxn in self.model.reactions: + probability = 0 + if rxn.id[0:3] == "bio" or rxn.id[0:3] == "EX_" or rxn.id[0:3] == "DM_" or rxn.id[0:3] == "SK_" or len(rxn.genes) == 0: + probability = 
1 + self.data["reactions"][rxn.id] = { + "presence": "", + "gapfilling":"", + "genes": {}, + "probability": probability + } + for gene in rxn.genes: + self.data["reactions"][rxn.id]["genes"][gene.id] = { + "presence": "", + "probability": 0.2 + } + if reaction_probabilities: + self.reset_reaction_probabilities(reaction_probabilities) + logger.warning("Input model is not an ensemble model. You will need to run build_ensemble() to create an ensemble model.") + else: + self.data = attributes["ensemble"] + + def reset_reaction_probabilities(self,reaction_probability_hash,clear_existing=False): + #clear_existing: if true, clear existing probabilities before setting new ones + if clear_existing: + for rxnid in self.data["reactions"]: + self.data["reactions"][rxnid]["probability"] = 0 + for geneid in self.data["reactions"][rxnid]["genes"]: + self.data["reactions"][rxnid]["genes"][geneid]["probability"] = 0 + #Overwriting reaction probabilities from input hash + for rxnid in reaction_probability_hash: + if rxnid in self.model.reactions: + if rxnid not in self.data["reactions"]: + self.data["reactions"][rxnid] = {"probability":0,"presence":"","genes":{}} + if "probability" in reaction_probability_hash[rxnid]: + self.data["reactions"][rxnid]["probability"] = reaction_probability_hash[rxnid]["probability"] + if "genes" in reaction_probability_hash[rxnid]: + for geneid in reaction_probability_hash[rxnid]["genes"]: + self.data["reactions"][rxnid]["genes"][geneid] = {"presence":"","probability":reaction_probability_hash[rxnid]["genes"][geneid]} + + def rebuild_from_models(self,models):#DONE + #Clearing existing data + self.data["ATP_analysis"] = {"core_atp_gapfilling":{},"selected_media":{},"tests":{}} + for rxnid in self.data["reactions"]: + self.data["reactions"][rxnid]["presence"] = "" + self.data["reactions"][rxnid]["gapfilling"] = "" + if "genes" in self.data["reactions"][rxnid]: + for geneid in self.data["reactions"][rxnid]["genes"]: + self.data["reactions"][rxnid]["genes"][geneid]["presence"] = "" + else: + self.data["reactions"][rxnid]["genes"] = {} + #Building presence strings from models + self.data["size"] = len(models) + for (i,mdlutl) in enumerate(models): + attributes = mdlutl.get_attributes() + if "ATP_analysis" in attributes: + if "core_atp_gapfilling" in attributes["ATP_analysis"]: + for media in attributes["ATP_analysis"]["core_atp_gapfilling"]: + if media not in self.data["ATP_analysis"]["core_atp_gapfilling"]: + self.data["ATP_analysis"]["core_atp_gapfilling"][media] = [] + for j in range(i): + self.data["ATP_analysis"]["core_atp_gapfilling"][media].append(None) + self.data["ATP_analysis"]["core_atp_gapfilling"][media].append(attributes["ATP_analysis"]["core_atp_gapfilling"][media]) + + if "selected_media" in attributes["ATP_analysis"]: + for media in attributes["ATP_analysis"]["selected_media"]: + if media not in self.data["ATP_analysis"]["selected_media"]: + self.data["ATP_analysis"]["selected_media"][media] = [] + for j in range(i): + self.data["ATP_analysis"]["selected_media"][media].append(None) + self.data["ATP_analysis"]["selected_media"][media].append(attributes["ATP_analysis"]["selected_media"][media]) + if "tests" in attributes["ATP_analysis"]: + for media in attributes["ATP_analysis"]["tests"]: + if media not in self.data["ATP_analysis"]["tests"]: + self.data["ATP_analysis"]["tests"][media] = {"objective":attributes["ATP_analysis"]["tests"][media]["objective"],"threshold":[]} + for j in range(i): + self.data["ATP_analysis"]["tests"][media]["threshold"].append(None) + 
self.data["ATP_analysis"]["tests"][media]["threshold"].append(attributes["ATP_analysis"]["tests"][media]["threshold"]) + add_reactions = [] + for rxn in mdlutl.model.reactions: + if rxn.id not in self.mdlutl.model.reactions: + add_reactions.append(rxn) + if rxn.id not in self.data["reactions"]: + self.data["reactions"][rxn.id] = { + "presence":'0' * i, + "genes":{} + } + self.data["reactions"][rxn.id]["presence"] += "1" + for gene in rxn.genes: + if gene.id not in self.data["reactions"][rxn.id]["genes"]: + self.data["reactions"][rxn.id]["genes"][gene.id] = '0' * i + self.data["reactions"][rxn.id]["genes"][gene.id] += "1" + self.mdlutl.model.add_reactions(add_reactions) + #Updating GPR of base model + for rxnid in self.data["reactions"]: + rxn = self.mdlutl.model.reactions.get_by_id(rxnid) + rxn.gene_reaction_rule = " or ".join(self.data["reactions"][rxnid]["genes"].keys()) + #Computing probabilities from presence if missing + for rxnid in self.ensemble_data["reactions"]: + if "probabilty" not in self.ensemble_data["reactions"][rxnid]: + self.ensemble_data["reactions"][rxnid]["probabilty"] = self.ensemble_data["reactions"][rxnid]["presence"].count('1')/len(self.ensemble_data["reactions"][rxnid]["presence"]) + for geneid in self.ensemble_data["reactions"][rxnid]["genes"]: + if "probabilty" not in self.ensemble_data["reactions"][rxnid]["genes"][geneid]: + self.ensemble_data["reactions"][rxnid]["genes"][geneid]["probabilty"] = self.ensemble_data["reactions"][rxnid]["genes"][geneid]["presence"].count('1')/len(self.ensemble_data["reactions"][rxnid]["genes"][geneid]["presence"]) + + def sample_from_probabilities(self,reaction_probabilities=None,from_reaction_probabilities=False,sample_count=1000): + #Overwriting reaction probabilities if provided + if reaction_probabilities: + self.reset_reaction_probabilities(reaction_probabilities) + self.data["size"] = sample_count + #Scrolling through ensemble data with probabilities + for rxnid in self.data["reactions"]: + if "probability" not in self.data["reactions"][rxnid]: + logger.critical("Reaction probability missing for "+rxnid+"!") + return None + if rxnid not in self.mdlutl.model.reactions: + logger.critical("Reaction probability for "+rxnid+" but reaction not in base model!") + return None + rxn = self.mdlutl.model.reactions.get_by_id(rxnid) + #Clearing existing data + self.data["reactions"][rxnid]["presence"] = "" + self.data["reactions"][rxnid]["gapfilling"] = "" + #Loading gene-level data + if "genes" not in self.data["reactions"][rxnid]: + self.data["reactions"][rxnid]["genes"] = {} + for gene in rxn.genes: + if gene.id not in self.data["reactions"][rxnid]["genes"] or "probability" not in self.data["reactions"][rxnid]["genes"][gene.id]: + logger.warning("Reaction "+rxnid+" has gene "+gene.id+" but no associated probability data!") + self.data["reactions"][rxnid]["genes"][gene.id] = {"presence":"","probablity":1} + self.data["reactions"][rxnid]["genes"][gene.id]["presence"] = "" + #Sampling from probabilities + for i in range(sample_count): + for rxnid in self.data["reactions"]: + present = False + if from_reaction_probabilities or len(self.data["reactions"][rxnid]["genes"]) == 0: + if random.uniform(0,1) < self.data["reactions"][rxnid]["probability"]: + present = True + else: + for geneid in self.data["reactions"][rxnid]["genes"]: + if random.uniform(0,1) < self.data["reactions"][rxnid]["genes"][geneid]["probability"]: + present = True + self.data["reactions"][rxnid]["genes"][geneid]["presence"] += "1" + else: + 
self.data["reactions"][rxnid]["genes"][geneid]["presence"] += "0" + if present: + self.data["reactions"][rxnid]["presence"] += "1" + else: + self.data["reactions"][rxnid]["presence"] += "0" + #Updating reaction probabilities from presence data + count = 0 + for item in self.data["reactions"][rxnid]["presence"]: + if item == "1": + count += 1 + self.data["reactions"][rxnid]["probability"] = count/len(self.data["reactions"][rxnid]["presence"]) + #Saving ensemble data in model attributes + return self.save_ensemble_model() + + def unpack_models(self,model_list=None): + output_models = [None]*self.data["size"] + for i in range(10):#self.data["size"]): + if not model_list or i in model_list: + clone_mdl = cobra.io.json.from_json(cobra.io.json.to_json(self.model)) + clone_mdl_utl = MSModelUtil.get(clone_mdl) + remove_reactions = [] + for rxn in clone_mdl_utl.model.reactions: + if rxn.id in self.data["reactions"]: + if self.data["reactions"][rxn.id]["presence"][i] == "0": + remove_reactions.append(rxn) + else: + new_genes = [] + for gene in rxn.genes: + if gene.id in self.data["reactions"][rxn.id]["genes"]: + if self.data["reactions"][rxn.id]["genes"][gene.id]["presence"][i] == "1": + new_genes.append(gene) + rxn.gene_reaction_rule = " or ".join([gene.id for gene in new_genes]) + else: + logger.warning("Ensemble model contains reaction not included in ensemble data. Removing reaction "+rxn.id+" from ensemble model.") + remove_reactions.append(rxn) + clone_mdl.remove_reactions(remove_reactions) + if "ATP_analysis" in self.data: + attributes = clone_mdl_utl.get_attributes() + attributes["ATP_analysis"] = {"core_atp_gapfilling":{},"selected_media":{},"tests":{}} + for media in self.data["ATP_analysis"]["core_atp_gapfilling"]: + if self.data["ATP_analysis"]["core_atp_gapfilling"][media][i] != None: + attributes["ATP_analysis"]["core_atp_gapfilling"][media] = self.data["ATP_analysis"]["core_atp_gapfilling"][media][i] + for media in self.data["ATP_analysis"]["selected_media"]: + if self.data["ATP_analysis"]["selected_media"][media][i] != None: + attributes["ATP_analysis"]["selected_media"][media] = self.data["ATP_analysis"]["selected_media"][media][i] + for media in self.data["ATP_analysis"]["tests"]: + if self.data["ATP_analysis"]["tests"][media]["threshold"][i] != None: + attributes["ATP_analysis"]["tests"][media] = { + "objective":self.data["ATP_analysis"]["tests"][media]["objective"], + "threshold":self.data["ATP_analysis"]["tests"]["threshold"][media][i] + } + clone_mdl_utl.save_attributes(attributes) + output_models[i] = clone_mdl_utl + return output_models + + def save_ensemble_model(self): + self.mdlutl.save_attributes(self.data,"ensemble") + return self.mdlutl + + def run_fba(self,media,objective,maximize,gene_ko=[],reaction_ko=[],pfba=True,fva=True): + msfba = MSFBA(self.model,media,objective,maximize,gene_ko,reaction_ko,pfba,fva,clone=True) + msfba.run() + models = self.unpack_models() + #Iterating over each model to run FBA on each + for mdlutl in models: + subfba = MSFBA(mdlutl,media,objective,maximize,gene_ko,reaction_ko,pfba,fva,clone=False) + subfba.run() + msfba.add_secondary_solution(subfba.primary_solution,subfba.fva_results) + return msfba + + def run_atp_method( + self, + core_template=None, + atp_medias=[], + compartment="c0", + max_gapfilling=10, + gapfilling_delta=0, + atp_hydrolysis_id=None, + load_default_medias=True, + forced_media=[], + default_media_path=None, + ): + models = self.unpack_models() + for mdlutl in models: + atpcorrection = MSATPCorrection( + core_template, + 
+    def save_ensemble_model(self):
+        self.mdlutl.save_attributes(self.data, "ensemble")
+        return self.mdlutl
+
+    def run_fba(self, media, objective, maximize, gene_ko=[], reaction_ko=[], pfba=True, fva=True):
+        msfba = MSFBA(self.model, media, objective, maximize, gene_ko, reaction_ko, pfba, fva, clone=True)
+        msfba.run()
+        models = self.unpack_models()
+        # Iterating over each model to run FBA on each
+        for mdlutl in models:
+            subfba = MSFBA(mdlutl, media, objective, maximize, gene_ko, reaction_ko, pfba, fva, clone=False)
+            subfba.run()
+            msfba.add_secondary_solution(subfba.primary_solution, subfba.fva_results)
+        return msfba
+
+    def run_atp_method(
+        self,
+        core_template=None,
+        atp_medias=[],
+        compartment="c0",
+        max_gapfilling=10,
+        gapfilling_delta=0,
+        atp_hydrolysis_id=None,
+        load_default_medias=True,
+        forced_media=[],
+        default_media_path=None,
+    ):
+        models = self.unpack_models()
+        for mdlutl in models:
+            # Passing the unpacked member model as the first argument; the
+            # original call omitted it
+            atpcorrection = MSATPCorrection(
+                mdlutl,
+                core_template,
+                atp_medias,
+                compartment,
+                max_gapfilling,
+                gapfilling_delta,
+                atp_hydrolysis_id,
+                load_default_medias,
+                forced_media,
+                default_media_path,
+            )
+            tests = atpcorrection.run_atp_correction()
+        self.rebuild_from_models(models)
+
+    def run_gapfilling(self):
+        pass
\ No newline at end of file
diff --git a/modelseedpy/core/msfba.py b/modelseedpy/core/msfba.py
new file mode 100644
index 00000000..2a86f8ee
--- /dev/null
+++ b/modelseedpy/core/msfba.py
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+import logging
+import re
+import traceback
+import cobra
+from cobra.flux_analysis import pfba
+from cobra.flux_analysis import flux_variability_analysis
+from modelseedpy.core.msmodelutl import MSModelUtil
+
+logger = logging.getLogger(__name__)
+
+
+class MSFBA:
+    def __init__(self, model_or_mdlutl, media, objective_reactions={"bio1": 1}, maximize=True, gene_ko=[], reaction_ko=[], pfba=True, fva=True, clone=True, primary_solution=None, id=None):
+        if isinstance(model_or_mdlutl, MSModelUtil):
+            model_or_mdlutl = model_or_mdlutl.model
+        if clone:
+            model_or_mdlutl = cobra.io.json.from_json(cobra.io.json.to_json(model_or_mdlutl))
+        self.model = model_or_mdlutl
+        self.mdlutl = MSModelUtil.get(model_or_mdlutl)
+        self.media = media
+        self.objective_reactions = objective_reactions
+        self.maximize = maximize
+        self.gene_ko = gene_ko
+        self.reaction_ko = reaction_ko
+        self.pkgmgr = self.mdlutl.pkgmgr
+        self.apply_parameters()
+        self.primary_solution = primary_solution
+        self.secondary_solutions = None
+        self.fva = fva
+        self.pfba = pfba
+        self.fva_results = None
+        self.secondary_fva = None
+        if id is None:
+            id = self.mdlutl.model.id + ".fba"
+        self.id = id
+
+    def build_objective(self):
+        sense = "max"
+        if not self.maximize:
+            sense = "min"
+        obj = self.model.problem.Objective(0, direction=sense)
+        objcoef = {}
+        for rxnid in self.objective_reactions:
+            if rxnid in self.model.reactions:
+                rxn = self.model.reactions.get_by_id(rxnid)
+                objcoef[rxn.forward_variable] = self.objective_reactions[rxnid]
+                objcoef[rxn.reverse_variable] = -1 * self.objective_reactions[rxnid]
+            else:
+                logger.warning(f"Objective reaction {rxnid} not found in model")
+        # Installing the objective on the model before setting coefficients;
+        # the original built the objective but never assigned it
+        self.model.objective = obj
+        self.model.objective.set_linear_coefficients(objcoef)
+
+    def apply_parameters(self):
+        self.pkgmgr.getpkg("KBaseMediaPkg").build_package(self.media)
+        for gene in self.gene_ko:
+            if gene in self.model.genes:
+                self.model.genes.get_by_id(gene).knock_out()
+            else:
+                logger.warning(f"KO gene {gene} not found in model")
+        for rxn in self.reaction_ko:
+            if rxn in self.model.reactions:
+                self.model.reactions.get_by_id(rxn).knock_out()
+            else:
+                logger.warning(f"KO reaction {rxn} not found in model")
+
+    def run(self):
+        if self.pfba:
+            self.primary_solution = pfba(self.model)
+        else:
+            self.primary_solution = self.model.optimize()
+        if self.fva:
+            self.fva_results = flux_variability_analysis(self.model)
+
+    def add_secondary_solution(self, solution, fva=None):
+        if self.secondary_solutions is None:
+            self.secondary_solutions = []
+        self.secondary_solutions.append(solution)
+        # Explicit None check because a pandas DataFrame is ambiguous in a
+        # boolean context
+        if fva is not None:
+            if self.secondary_fva is None:
+                self.secondary_fva = []
+            self.secondary_fva.append(fva)
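+    # Usage sketch (the model and media objects are assumed to exist):
+    #     fba = MSFBA(model, media, {"bio1": 1}, maximize=True, pfba=True, fva=True)
+    #     fba.run()
+    #     print(fba.primary_solution.objective_value)
+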
variable_class = "Negative" + elif variable_min < 0 and variable_max <= 0: + variable_class = "Negative variable" + else: + variable_class = "Variable" + return variable_class + + def generate_kbase_data(self,fbamodel_ref,media_ref): + output = { + "FBABiomassVariables": [], + "FBACompoundBounds": [], + "FBACompoundVariables": [], + "FBAConstraints": [], + "FBADeletionResults": [], + "FBAMetaboliteProductionResults": [], + "FBAMinimalMediaResults": [], + "FBAMinimalReactionsResults": [], + "FBAPromResults": [], + "FBAReactionBounds": [], + "FBAReactionVariables": [], + "FBATintleResults": [], + "MFALog": "", + "PROMKappa": 1, + "QuantitativeOptimizationSolutions": [], + "__VERSION__": 1, + "additionalCpd_refs": [], + "allReversible": 0, + "biomassRemovals": {}, + "biomassflux_objterms": {"bio1": 1}, + "calculateReactionKnockoutSensitivity": 0, + "comboDeletions": 0, + "compoundflux_objterms": {}, + "decomposeReversibleDrainFlux": 0, + "decomposeReversibleFlux": 0, + "defaultMaxDrainFlux": 0, + "defaultMaxFlux": 1000, + "defaultMinDrainFlux": -1000, + "drainfluxUseVariables": 0, + "fbamodel_ref": fbamodel_ref, + "findMinimalMedia": 0, + "fluxMinimization": 1, + "fluxUseVariables": 0, + "fva": 0, + "gapfillingSolutions": [], + "geneKO_refs": [], + "id": self.id, + "inputfiles": {}, + "maximizeActiveReactions": 0, + "maximizeObjective": 1, + "media_list_refs": [], + "media_ref": media_ref, + "minimizeErrorThermodynamicConstraints": 0, + "minimize_reaction_costs": {}, + "minimize_reactions": 0, + "noErrorThermodynamicConstraints": 0, + "numberOfSolutions": 1, + "objectiveConstraintFraction": 0.1, + "objectiveValue": self.primary_solution.objective_value, + "other_objectives": [], + "outputfiles": {}, + "parameters": { + "Auxotrophy metabolite list": "", + "Beachhead metabolite list": "", + "minimum_target_flux": "0.01", + "save phenotype fluxes": "0", + "suboptimal solutions": "1", + }, + "quantitativeOptimization": 0, + "reactionKO_refs": [], + "reactionflux_objterms": {}, + "simpleThermoConstraints": 0, + "thermodynamicConstraints": 0, + "uptakeLimits": {}, + } + + for rxn in self.model.reactions: + flux = 0 + if rxn.id in self.primary_solution.fluxes: + flux = self.primary_solution.fluxes[rxn.id] + min_flux = rxn.lower_bound + max_flux = rxn.upper_bound + if self.fva_results and rxn.id in self.fva_results: + min_flux, max_flux = self.fva_results[rxn.id] + other_mins= [] + other_maxes = [] + other_fluxes = [] + if self.secondary_solutions: + for sol in self.secondary_solutions: + if rxn.id in sol.fluxes: + other_fluxes.append(sol.fluxes[rxn.id]) + else: + other_fluxes.append(0) + if self.secondary_fva: + othermin = rxn.lower_bound + othermax = rxn.upper_bound + for fva in self.secondary_fva: + if rxn.id in fva: + othermin, othermax = fva[rxn.id] + other_mins.append(othermin) + other_maxes.append(othermax) + variable_class = self.get_variable_class(min_flux, max_flux) + variable_data = { + "class": variable_class, + "lowerBound": rxn.lower_bound, + "max": max_flux, + "min": min_flux, + "upperBound": rxn.upper_bound, + "other_max": other_maxes, + "other_min": other_mins, + "other_values": other_fluxes, + "value": flux, + "variableType": "flux" + } + variable_key = "FBAReactionVariables" + if rxn.id.startswith("EX_"): + lower = variable_data["lowerBound"] + variable_data["lowerBound"] = -1 * variable_data["upperBound"] + variable_data["upperBound"] = -1 * lower + lower = variable_data["min"] + variable_data["min"] = -1 * variable_data["max"] + variable_data["max"] = -1 * lower + 
variable_data["value"] = -1 * variable_data["value"] + variable_data["variableType"] = "drainflux" + variable_data["modelcompound_ref"] = "~/fbamodel/modelcompounds/id/" + rxn.id[3:] + variable_key = "FBACompoundVariables" + elif rxn.id.startswith("bio"): + variable_data["variableType"] = "biomassflux" + variable_data["biomass_ref"] = "~/fbamodel/biomasses/id/" + rxn.id + variable_key = "FBABiomassVariables" + else: + variable_data["modelreaction_ref"] = "~/fbamodel/modelreactions/id/" + rxn.id + variable_data["exp_state"] = "unknown" + variable_data["biomass_dependencies"] = [] + variable_data["coupled_reactions"] = [] + variable_data["expression"] = 0 + variable_data["scaled_exp"] = 0 + output[variable_key].append(variable_data) + return output \ No newline at end of file diff --git a/modelseedpy/core/msfbareport.py b/modelseedpy/core/msfbareport.py new file mode 100644 index 00000000..df5c34bb --- /dev/null +++ b/modelseedpy/core/msfbareport.py @@ -0,0 +1,636 @@ +# -*- coding: utf-8 -*- +import pandas as pd +import logging +import os +import re +import jinja2 +from os.path import dirname +from pandas.io.formats.style import Styler +from modelseedpy.core.msmodelutl import MSModelUtil + +module_path = dirname(os.path.abspath(__file__)) + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + + +class MSModelReport: + def __init__(self, model_or_mdlutl): + if isinstance(model_or_mdlutl, MSModelUtil): + self.model = model_or_mdlutl.model + self.modelutl = model_or_mdlutl + else: + self.model = model_or_mdlutl + self.modelutl = MSModelUtil.get(model_or_mdlutl) + + def generate_reports(self, report_path, multi_tab_report_path): + self.build_report(report_path) + self.build_multitab_report(multi_tab_report_path) + + # Helper function to build overview data + def build_overview_data(self): + # Get the number of compartments + number_compartments = len( + set([metabolite.compartment for metabolite in self.model.metabolites]) + ) + + # Extract gapfilling information + core_gapfilling_media = [] + gapfilling_media = [] + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + if gf_sensitivity: + for media in gf_sensitivity: + if ( + "bio1" in self.modelutl.attributes["gf_sensitivity"][media] + and "success" + in self.modelutl.attributes["gf_sensitivity"][media]["bio1"] + ): + gapfilling_media.append(media) + if ( + "rxn00062_c0" in self.modelutl.attributes["gf_sensitivity"][media] + and "success" + in self.modelutl.attributes["gf_sensitivity"][media]["rxn00062_c0"] + ): + core_gapfilling_media.append(media) + + # Count the number of gapfills + number_gapfills = len(gapfilling_media) + + # Convert the lists to strings + core_gapfilling_str = ( + "; ".join(core_gapfilling_media) + if core_gapfilling_media + else "No core gapfilling needed." + ) + gapfilling_media_str = ( + "; ".join(gapfilling_media) + if gapfilling_media + else "No genome-scale gapfilling." 
+ ) + + overview = { + "Model ID": self.model.id, + "Full Gapfilling and ATP Analysis Report": "TBD", # You may replace 'TBD' with actual data when available + "Genome Scale Template": self.model.notes.get( + "kbase_template_refs", "Data Not Available" + ), + "Core Gapfilling Media": core_gapfilling_str, + "Gapfilling Media": gapfilling_media_str, + "Source Genome": self.model.notes.get( + "kbase_genome_ref", "Data Not Available" + ), + "Total Number of reactions": self.modelutl.nonexchange_reaction_count(), + "Number compounds": len(self.model.metabolites), + "Number compartments": number_compartments, + "Number biomass": len( + [ + rxn + for rxn in self.model.reactions + if rxn.annotation.get("sbo") == "SBO:0000629" + ] + ), + "Number gapfills": number_gapfills, + } + return overview + + # Helper function for extracting gapfilling data + def extract_gapfilling_data(self, gf_sensitivity): + if gf_sensitivity is None: + return [], {} + + gapfilling_dict = {} + gapfilling_summary = {} + + for media, media_data in gf_sensitivity.items(): + for target, target_data in media_data.items(): + gf_data = target_data.get("success", {}) + if isinstance(gf_data, dict): + for reaction_id, reaction_data in gf_data.items(): + for direction, metabolites in reaction_data.items(): + # If metabolites is None, set to empty string + if metabolites is None: + metabolites = "" + + # Extract both IDs and Names for Gapfilling Sensitivity + sensitivity_ids = [] + sensitivity_names = [] + if isinstance(metabolites, (list, tuple)): + for met_id in metabolites: + sensitivity_ids.append(met_id) + met_name = ( + self.model.metabolites.get_by_id(met_id).name + if met_id in self.model.metabolites + else met_id + ) + sensitivity_names.append(met_name) + else: + metabolites = str(metabolites) + entry = { + "reaction_id": reaction_id, + "reaction_name": self.model.reactions.get_by_id( + reaction_id + ).name + if reaction_id in self.model.reactions + else reaction_id, + "media": media, + "direction": direction, + "target": target, + "gapfilling_sensitivity_id": "; ".join(sensitivity_ids) + if sensitivity_ids + else metabolites, + "gapfilling_sensitivity_name": "; ".join( + sensitivity_names + ) + if sensitivity_names + else metabolites, + } + + # Update the summary dictionary + if reaction_id not in gapfilling_summary: + gapfilling_summary[reaction_id] = [] + gapfilling_summary[reaction_id].append( + f"{media}: {direction}" + ) + + # Check if reaction_id is already in dictionary + if reaction_id in gapfilling_dict: + # Update the media + existing_entry = gapfilling_dict[reaction_id] + existing_media = existing_entry["media"].split("; ") + if media not in existing_media: + existing_media.append(media) + existing_entry["media"] = "; ".join(existing_media) + else: + gapfilling_dict[reaction_id] = entry + + return list(gapfilling_dict.values()), gapfilling_summary + + # transform data to be used in tabular format to use in build_model_report + def transform_gapfilling_data(self, gapfilling_data): + transformed_data = [] + for entry in gapfilling_data: + row = [ + entry["reaction_id"], + entry["reaction_name"], + entry["media"], + entry["direction"], + entry["target"], + entry["gapfilling_sensitivity_id"], + entry["gapfilling_sensitivity_name"], + ] + transformed_data.append(row) + return transformed_data + + # Extract ATP analysis data + def extract_atp_analysis_data(self, atp_analysis, atp_expansion_filter): + entries = [] + if atp_analysis and "core_atp_gapfilling" in atp_analysis: + for media, data in 
atp_analysis["core_atp_gapfilling"].items(): + score = data.get("score", None) + new_reactions = [ + "{}: {}".format(k, v) for k, v in data.get("new", {}).items() + ] + reversed_reactions = [ + "{}: {}".format(k, v) for k, v in data.get("reversed", {}).items() + ] + atp_production = "Not integrated" + if ( + "selected_media" in atp_analysis + and media in atp_analysis["selected_media"] + ): + atp_production = atp_analysis["selected_media"][media] + + # Extracting the "Filtered Reactions" in the required format + filtered_reactions = [] + for k, v in atp_expansion_filter.get(media, {}).items(): + if isinstance(v, dict): + for sub_k, sub_v in v.items(): + if isinstance(sub_v, dict): + for reaction, direction_dict in sub_v.items(): + direction = list(direction_dict.keys())[0] + filtered_reactions.append( + f"{reaction}: {direction}" + ) + filtered_reactions_str = "; ".join(filtered_reactions) + + if score is not None: + entries.append( + { + "media": media, + "no_of_gapfilled_reactions": score, + "atp_production": atp_production, + "gapfilled_reactions": "; ".join(new_reactions), + "reversed_reaction_by_gapfilling": "; ".join( + reversed_reactions + ), + "filtered_reactions": filtered_reactions_str, + } + ) + # Sorting the entries based on the 'no_of_gapfilled_reactions' column + entries.sort(key=lambda x: x["no_of_gapfilled_reactions"]) + return entries + + # Extract ATP production data for the ATP Analysis tab + def extract_atp_production_data(self, atp_analysis): + atp_production_dict = {} + if atp_analysis: + selected_media = atp_analysis.get("selected_media", {}) + core_atp_gapfilling = atp_analysis.get("core_atp_gapfilling", {}) + + # First, process selected_media + for media, value in selected_media.items(): + atp_production_dict[media] = round(value, 2) + + # Next, process core_atp_gapfilling for media not in selected_media + for media, data in core_atp_gapfilling.items(): + if media not in atp_production_dict: + if data.get("failed"): + atp_production_dict[media] = "failed" + else: + # If the media was not processed in selected_media and it's not failed, set as 'Not Integrated' + atp_production_dict[media] = "Not Integrated" + + return atp_production_dict + + def build_multitab_report(self, output_path): + + # Build overview data + overview_data = self.build_overview_data() + + # Get gf_sensitivity attribute from the model + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + + # Extract gapfilling data + gapfilling_entries, gapfilling_reaction_summary = self.extract_gapfilling_data( + gf_sensitivity + ) + + # Check if ATP_analysis attribute is present in the model + atp_analysis = self.modelutl.attributes.get("ATP_analysis", None) + if atp_analysis: + atp_expansion_filter = self.modelutl.attributes.get( + "atp_expansion_filter", {} + ) + atp_analysis_entries = self.extract_atp_analysis_data( + atp_analysis, atp_expansion_filter + ) + else: + atp_analysis_entries = [] + + # Initialize context dictionary + context = { + "overview": overview_data, + "reactions": [], + "compounds": [], + "genes": [], + "biomass": [], + "gapfilling": gapfilling_entries, # Populated with gapfilling data + "atpanalysis": atp_analysis_entries, # Populated with ATP analysis data + } + + print("Module Path:", module_path + "/../data/") + + exchanges = {r.id for r in self.model.exchanges} + + # Identify biomass reactions using SBO annotation + biomass_reactions_ids = { + rxn.id + for rxn in self.model.reactions + if rxn.annotation.get("sbo") == "SBO:0000629" + } + + # Reactions Tab + for 
rxn in self.model.reactions: + if rxn.id not in exchanges and rxn.id not in biomass_reactions_ids: + equation = rxn.build_reaction_string(use_metabolite_names=True) + rxn_data = { + "id": rxn.id, + "name": rxn.name, + "equation": equation, + "genes": rxn.gene_reaction_rule, + "gapfilling": "; ".join( + gapfilling_reaction_summary.get(rxn.id, []) + ), # Empty list results in an empty string + } + context["reactions"].append(rxn_data) + + # Compounds Tab + for cpd in self.model.metabolites: + cpd_data = { + "id": cpd.id, + "name": cpd.name, + "formula": cpd.formula, + "charge": cpd.charge, + "compartment": cpd.compartment, + } + context["compounds"].append(cpd_data) + + # Genes Tab + for gene in self.model.genes: + gene_data = { + "gene": gene.id, + "reactions": "; ".join([rxn.id for rxn in gene.reactions]), + } + context["genes"].append(gene_data) + + # Biomass Tab + if biomass_reactions_ids: + for biomass_rxn_id in biomass_reactions_ids: + biomass_rxn = self.model.reactions.get_by_id(biomass_rxn_id) + for metabolite, coefficient in biomass_rxn.metabolites.items(): + compound_id = metabolite.id + compound_name = metabolite.name.split("_")[0] + compartment = compound_id.split("_")[-1] + + biomass_data = { + "biomass_reaction_id": biomass_rxn.id, + "biomass_compound_id": compound_id, + "name": compound_name, + "coefficient": coefficient, + "compartment": compartment, + } + context["biomass"].append(biomass_data) + else: + print("No biomass reactions found in the model.") + + # Gapfilling Tab + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + gapfilling_data = self.extract_gapfilling_data(gf_sensitivity) + context["gapfilling"] = gapfilling_entries + + # Extract ATP Production Data + atp_production_data = self.extract_atp_production_data(atp_analysis) + + # Populate the 'atpanalysis' context with ATP production data + for entry in context["atpanalysis"]: + media = entry["media"] + entry["atp_production"] = atp_production_data.get(media, None) + + # Diagnostics + unique_biomass_rxns = biomass_reactions_ids + print(f"Unique biomass reactions identified: {len(unique_biomass_rxns)}") + print(f"Biomass Reaction IDs: {', '.join(unique_biomass_rxns)}") + + print("\nFirst 2 reactions:") + for rxn in context["reactions"][:2]: + print(rxn) + + print("\nFirst 2 compounds:") + for cpd in context["compounds"][:2]: + print(cpd) + + print("\nFirst 2 genes:") + for gene in context["genes"][:2]: + print(gene) + + print("\nFirst 2 biomass compounds:") + for bm in context["biomass"][:2]: + print(bm) + + print("\nFirst 2 gapfilling entries:") + for gf in context["gapfilling"][:2]: + print(gf) + + print("\nFirst 2 ATP Analysis entries:") + for entry in context["atpanalysis"][:2]: + print(entry) + + # Render with template + env = jinja2.Environment( + loader=jinja2.FileSystemLoader(module_path + "/../data/"), + autoescape=jinja2.select_autoescape(["html", "xml"]), + ) + html = env.get_template("ModelReportTemplate.html").render(context) + directory = dirname(output_path) + os.makedirs(directory, exist_ok=True) + with open(output_path, "w") as f: + f.write(html) + + def build_report(self, output_path): + """Builds model HTML report for the Model Summary table + Parameters + ---------- + model : cobra.Model + Model to use to build the report + """ + + # 1. Utilize the build_overview_data method + model_summary_data = self.build_overview_data() + # Remove the unwanted entry + model_summary_data.pop("Full Gapfilling and ATP Analysis Report", None) + # 2. 
Transform the dictionary into a list of tuples + model_summary_list = [(key, value) for key, value in model_summary_data.items()] + # 3. Convert to DataFrame + model_summary_df = pd.DataFrame(model_summary_list, columns=["", ""]) + + # Style the DataFrame (as was done previously) + model_summary_df_styled = model_summary_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Fetching the gapfilling sensitivity data + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + gapfilling_data = self.extract_gapfilling_data(gf_sensitivity) + gapfilling_list = self.transform_gapfilling_data(gapfilling_data[0]) + + # Convert the gapfilling_list to a DataFrame + gapfillings_analysis_df = pd.DataFrame( + gapfilling_list, + columns=[ + "Reaction ID", + "Reaction Name", + "Media", + "Direction", + "Target", + "Gapfilling Sensitivity ID", + "Gapfilling Sensitivity Name", + ], + ) + + # Apply style to Gapfillings Analysis DataFrame + gapfillings_analysis_df_styled = gapfillings_analysis_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Legend for Gapfillings Analysis + annotations_text_gapfillings = """ +
+        <ul>
+            <li><b>Reaction ID:</b> The identifier of the reaction.</li>
+            <li><b>Reaction Name:</b> The name of the reaction.</li>
+            <li><b>Media:</b> The media used by gap filling.</li>
+            <li><b>Direction:</b> The direction of the reaction. Can be ">" for forward, "<" for reverse, or "=" for both directions.</li>
+            <li><b>Target:</b> The reaction selected as the objective function target for the gapfilling optimization problem. Targets here can be the model's biomass reaction, commonly named "bio1" for models created by this app.
+                Alternatively, the "rxn00062" (ATP Production) reaction is shown for cases where gapfilling was applied to guarantee ATP production in a given media.
+                When reactions are gapfilled for ATP production, we recommend checking the full Core ATP Analysis in the table below.</li>
+            <li><b>Gapfilling Sensitivity ID and Name:</b> Gapfilling is necessary when compounds in the biomass objective function cannot be produced by the model.
+                For each reaction, we list the biomass compound(s) that cannot be synthesized by the model without gapfilling.
+                In cases where gapfilling fails, there are two possible scenarios:
+                1) FBF (failed before filtering): the gapfilling immediately failed, even before we filtered out the ATP-breaking reactions. This means this objective CANNOT be satisfied with the entire current database.
+                2) FAF (failed after filtering): the gapfilling succeeded before filtering, but failed after filtering out reactions that break ATP. This tells you definitively whether the ATP filtering caused the gapfilling to fail.</li>
+        </ul>
+ """ + + # Extract ATP analysis data + atp_analysis = self.modelutl.attributes.get("ATP_analysis", None) + atp_expansion_filter = self.modelutl.attributes.get("atp_expansion_filter", {}) + atp_analysis_entries = self.extract_atp_analysis_data( + atp_analysis, atp_expansion_filter + ) + + # Convert the atp_analysis_entries list to a DataFrame + atp_analysis_df = pd.DataFrame(atp_analysis_entries) + + # Apply style to ATP Analysis DataFrame + atp_analysis_df_styled = atp_analysis_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Legend for ATP Analysis + annotations_text_atp_analysis = """ +
+        <ul>
+            <li><b>No. of gapfilled reactions:</b> The number of reactions added by the gapfilling process.</li>
+            <li><b>Media:</b> The media in which the reaction takes place.</li>
+            <li><b>ATP Production:</b> ATP production by the core metabolism model.</li>
+            <li><b>Gapfilled Reactions:</b> Reactions added during the gapfilling process.</li>
+            <li><b>Reversed Reaction by Gapfilling:</b> Reactions that have been reversed during the gapfilling process.</li>
+            <li><b>Filtered Reactions:</b> Reactions that have been filtered out during the analysis. When a reaction addition would lead to a large increase in ATP production or an infinite energy loop, we filter that reaction out of the gapfilling database and prevent it from being added to the model.</li>
+        </ul>
+ """ + + # ATP analysis explanation text + explanation_text_atp_analysis = """ +
+        <p>During model reconstruction, we analyze the genome's core metabolism draft model (model without gapfilling) to assess energy biosynthesis capabilities.
+        The goal of this analysis is to ensure the core metabolism model is able to produce ATP before we expand the model to the genome scale.
+        This step is designed to prevent gapfilling from introducing reactions that create energy-generating loops.
+        The tests are conducted on a large collection of minimal conditions, with the goal of simulating the model's capability to produce energy with different electron donor, electron acceptor, and carbon source combinations.</p>
+        <p>When the draft model of the core metabolism is capable of producing ATP in at least one of the test media, no gapfilling reactions from this analysis will be added to the model. While we still report the gapfilling requirements for the test media formulations that fail to produce ATP with that draft core model, we only integrate these solutions in the model when no test media succeeds in producing ATP.
+        In this case, the integrated gapfilling solution(s) will be displayed in the "Gapfilling Analysis" table above, with the "Target" "rxn00062" (ATP Production) objective function.</p>
+        <p>The goal is to display the test results for all media to provide clues about the metabolic capabilities of the genome(s). When many reactions are required for growth on the SO4 testing media conditions, this could be a good indicator that the organism is not capable of performing sulfate reduction.
+        On the other hand, when only one gapfill reaction is required for ATP production in a given media, multiple scenarios can be considered:
+        1) The organism(s) can't grow on the test condition, and we correctly did not add the reaction to the model. 2) There may be an issue with the source genome annotation missing a specific gene function. 3) There may be an issue with the model reconstruction database. We hope this data helps you make more informed decisions on reactions that may need to be manually curated in the model.
+        In cases where it is known from the literature or unpublished experimental results that an organism is capable of producing ATP in a given media condition that requires gapfilling in this analysis, you can use the parameter "Force ATP media" in the reconstruction app to ensure those reactions are integrated into the model.</p>
+ """ + + # Save the data to HTML with the styled DataFrames and the legends + directory = os.path.dirname(output_path) + os.makedirs(directory, exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: + f.write('') + f.write("
<h2>Model Summary</h2>")
+            f.write(model_summary_df_styled.render(escape=False))
+            f.write("<br><br>")
+            f.write("<h2>Gapfillings Analysis</h2>")
+
+            # Check for Gapfillings Analysis data
+            if not gapfillings_analysis_df.empty:
+                f.write(gapfillings_analysis_df_styled.render(escape=False))
+                f.write(f"<p><b>Legend:</b></p>{annotations_text_gapfillings}")
+            else:
+                f.write(
+                    "<p><i>Warning: No Gapfillings Analysis data available for this model.</i></p>"
+                )
+
+            f.write("<h2>Core ATP Analysis</h2>")
+
+            # Check for ATP Analysis data
+            if not atp_analysis_df.empty:
+                f.write(atp_analysis_df_styled.render(escape=False))
+                f.write(f"<p><b>Legend:</b></p>{annotations_text_atp_analysis}")
+                f.write(explanation_text_atp_analysis)
+            else:
+                f.write(
+                    "<p><i>Warning: No Core ATP Analysis data available for this model.</i></p>
" + ) diff --git a/modelseedpy/core/msgapfill.py b/modelseedpy/core/msgapfill.py old mode 100644 new mode 100755 index 6544d74c..480135dc --- a/modelseedpy/core/msgapfill.py +++ b/modelseedpy/core/msgapfill.py @@ -1,66 +1,102 @@ +#!/usr/bin/python # -*- coding: utf-8 -*- import logging -import itertools # !!! the import is never used - -logger = logging.getLogger(__name__) - import cobra import re +import json +import numpy as np +import pandas as pd +import time +from optlang.symbolics import Zero, add from modelseedpy.core import FBAHelper # !!! the import is never used from modelseedpy.fbapkg.mspackagemanager import MSPackageManager from modelseedpy.core.msmodelutl import MSModelUtil -from modelseedpy.fbapkg.gapfillingpkg import default_blacklist from modelseedpy.core.exceptions import GapfillingError +from collections import defaultdict + + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO # WARNING +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO class MSGapfill: + @staticmethod + def gapfill_count(solution): + total = 0 + if "new" in solution: + total += len(solution["new"]) + if "reversed" in solution: + total += len(solution["reversed"]) + return total + def __init__( self, - model, + model_or_mdlutl, default_gapfill_templates=[], default_gapfill_models=[], test_conditions=[], reaction_scores={}, blacklist=[], + atp_gapfilling=False, + minimum_obj=0.01, + default_excretion=100, + default_uptake=0, + default_target=None, + base_media = None, + base_media_target_element = "C" ): - if isinstance(model, MSModelUtil): - self.model = model.model - self.modelutl = model + # Discerning input is model or mdlutl and setting internal links + if isinstance(model_or_mdlutl, MSModelUtil): + self.model = model_or_mdlutl.model + self.mdlutl = model_or_mdlutl else: - self.model = model - self.modelutl = MSModelUtil(model) + self.model = model_or_mdlutl + self.mdlutl = MSModelUtil.get(model_or_mdlutl) + # Setting gapfilling attribute in model utl so link is bidirectional + if not atp_gapfilling: + self.mdlutl.gfutl = self self.auto_sink = [ + "cpd01042", "cpd02701", "cpd11416", "cpd15302", + "cpd03091", ] # the cpd11416 compound is filtered during model extension with templates - self.gfmodel = self.lp_filename = self.last_solution = None + # Cloning model to create gapfilling model + self.gfmodel = cobra.io.json.from_json(cobra.io.json.to_json(self.model)) + self.gfmodelutl = MSModelUtil.get(self.gfmodel) + # Getting package manager for gapfilling model + self.gfpkgmgr = MSPackageManager.get_pkg_mgr(self.gfmodelutl) + # Setting target from input + if default_target: + self.default_target = default_target + self.gfmodel.objective = self.gfmodel.problem.Objective( + self.gfmodel.reactions.get_by_id(default_target).flux_expression, + direction="max", + ) + # Setting parameters for gapfilling + self.last_solution = None self.model_penalty = 1 + self.default_minimum_objective = minimum_obj self.default_gapfill_models = default_gapfill_models self.default_gapfill_templates = default_gapfill_templates self.gapfill_templates_by_index, self.gapfill_models_by_index = {}, {} self.gapfill_all_indecies_with_default_templates = True self.gapfill_all_indecies_with_default_models = True - self.blacklist = list(set(default_blacklist + blacklist)) + self.blacklist = list(set(blacklist)) self.test_condition_iteration_limit = 10 self.test_conditions = test_conditions self.reaction_scores = reaction_scores - - def run_gapfilling( - self, - media=None, - 
target=None,
-        minimum_obj=0.01,
-        binary_check=False,
-        prefilter=True,
-    ):
-        if target:
-            self.model.objective = self.model.problem.Objective(
-                self.model.reactions.get_by_id(target).flux_expression, direction="max"
-            )
-        self.gfmodel = cobra.io.json.from_json(cobra.io.json.to_json(self.model))
-        pkgmgr = MSPackageManager.get_pkg_mgr(self.gfmodel)
-        pkgmgr.getpkg("GapfillingPkg").build_package(
+        self.default_excretion = default_excretion
+        self.default_uptake = default_uptake
+        self.minimum_obj = minimum_obj
+        self.base_media = base_media
+        self.base_media_target_element = base_media_target_element
+        self.cumulative_gapfilling = []
+        # Building gapfilling package
+        self.gfpkgmgr.getpkg("GapfillingPkg").build_package(
             {
                 "auto_sink": self.auto_sink,
                 "model_penalty": self.model_penalty,
@@ -70,84 +106,759 @@ def run_gapfilling(
                 "gapfill_models_by_index": self.gapfill_models_by_index,
                 "gapfill_all_indecies_with_default_templates": self.gapfill_all_indecies_with_default_templates,
                 "gapfill_all_indecies_with_default_models": self.gapfill_all_indecies_with_default_models,
-                "default_excretion": 100,
-                "default_uptake": 100,
+                "default_excretion": default_excretion,
+                "default_uptake": default_uptake,
                 "minimum_obj": minimum_obj,
                 "blacklist": self.blacklist,
                 "reaction_scores": self.reaction_scores,
                 "set_objective": 1,
+                "base_media": base_media,
+                "base_media_target_element": base_media_target_element,
             }
         )
-        pkgmgr.getpkg("KBaseMediaPkg").build_package(media)
-        # Filtering breaking reactions out of the database
-        if prefilter and self.test_conditions:
-            pkgmgr.getpkg("GapfillingPkg").filter_database_based_on_tests(
-                self.test_conditions
-            )
+
+    def test_gapfill_database(self, media, target=None, before_filtering=True, active_reactions=[]):
+        # Testing if gapfilling can work before filtering
+        if target:
+            self.gfpkgmgr.getpkg("GapfillingPkg").set_base_objective(target, None)
+        else:
+            target = str(self.gfmodel.objective)
+            target = target.split(" ")[0]
+            target = target[13:]
+        # Setting media
+        self.gfpkgmgr.getpkg("KBaseMediaPkg").build_package(media)
+        if self.gfpkgmgr.getpkg("GapfillingPkg").test_gapfill_database(active_reactions):
+            return True
+        if self.gfpkgmgr.getpkg("GapfillingPkg").test_solution.status == "infeasible":
+            return False
+        gf_sensitivity = {}
+        if target != "rxn00062_c0":
+            gf_sensitivity = self.mdlutl.get_attributes("gf_sensitivity", {})
+        if media.id not in gf_sensitivity:
+            gf_sensitivity[media.id] = {}
+        if target not in gf_sensitivity[media.id]:
+            gf_sensitivity[media.id][target] = {}
+        filter_msg = " "
+        note = "FAF"
+        if before_filtering:
+            filter_msg = " before filtering "
+            note = "FBF"
+        gf_sensitivity[media.id][target][
+            note
+        ] = self.mdlutl.find_unproducible_biomass_compounds(target)
+        if target != "rxn00062_c0":
+            self.mdlutl.save_attributes(gf_sensitivity, "gf_sensitivity")
+        logger.warning(
+            "No gapfilling solution found"
+            + filter_msg
+            + "for "
+            + media.id
+            + " activating "
+            + target
+        )
+        return False
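+    # Shape of the "gf_sensitivity" attribute written above (media and target
+    # IDs are placeholders); the note key is "FBF" (failed before filtering)
+    # or "FAF" (failed after filtering):
+    #     {"<media_id>": {"bio1": {"FBF": <unproducible biomass compounds>}}}
+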
output["active_reactions"].append(active_reactions) + output["conditions"].append({ + "media": media, + "is_max_threshold": False, + "threshold": thresholds[i], + "objective": targets[i], + }) + # Filtering + if prefilter: + logger.debug("Filtering database") + self.prefilter(growth_conditions=output["conditions"],active_reaction_sets=output["active_reactions"]) + medias = [] + targets = [] + thresholds = [] + conditions = [] + active_reaction_sets = [] + logger.debug("Testing filtered database") + for i,media in enumerate(output["medias"]): + active_reactions = [] + if self.test_gapfill_database(media,output["targets"][i],before_filtering=False,active_reactions=active_reactions): + medias.append(media) + targets.append(output["targets"][i]) + thresholds.append(output["thresholds"][i]) + conditions.append(output["conditions"][i]) + active_reaction_sets.append(active_reactions) + output["medias"] = medias + output["targets"] = targets + output["thresholds"] = thresholds + output["conditions"] = conditions + output["active_reactions"] = active_reaction_sets + return output + + def prefilter(self,test_conditions=None,growth_conditions=[],use_prior_filtering=False,base_filter_only=False,active_reaction_sets=[]): + """Prefilters the database by removing any reactions that break specified ATP tests + Parameters + ---------- + test_conditions : [] + List of conditions to be tested when filtering the gapfilling database. If not specified, the test_conditions attribute will be used + """ + if not test_conditions: + test_conditions = self.test_conditions + if self.test_conditions: + logger.debug(f"PREFILTERING WITH {str(len(growth_conditions))} GROWTH CONDITIONS") + base_filter = None + if use_prior_filtering: + base_filter = self.mdlutl.get_attributes("gf_filter", {}) + self.gfpkgmgr.getpkg("GapfillingPkg").filter_database_based_on_tests( + self.test_conditions, + growth_conditions=growth_conditions, + base_filter=base_filter, + base_filter_only=base_filter_only, + active_reaction_sets=active_reaction_sets + ) + gf_filter = self.gfpkgmgr.getpkg("GapfillingPkg").modelutl.get_attributes( + "gf_filter", {} ) + base_filter = self.mdlutl.get_attributes("gf_filter", {}) + for media_id in gf_filter: + base_filter[media_id] = gf_filter[media_id] + + def run_gapfilling( + self, + media=None, + target=None, + minimum_obj=None, + binary_check=False, + prefilter=True, + ): + """Run gapfilling on a single media condition to force the model to achieve a nonzero specified objective + Parameters + ---------- + media : MSMedia + Media in which the model should be gapfilled + target : string + Name or expression describing the reaction or combination of reactions to the optimized + minimum_obj : double + Value to use for the minimal objective threshold that the model must be gapfilled to achieve + binary_check : bool + Indicates if the solution should be checked to ensure it is minimal in the number of reactions involved + prefilter : bool + Indicates if the gapfilling database should be prefiltered using the tests provided in the MSGapfill constructor before running gapfilling + """ + # Setting target and media if specified + if not target: + target = self.default_target + if not minimum_obj: + minimum_obj = self.default_minimum_objective + self.gfpkgmgr.getpkg("GapfillingPkg").set_base_objective(target,minimum_obj) + if media: + self.gfpkgmgr.getpkg("GapfillingPkg").set_media(media) + + # Testing if gapfilling can work before filtering + if not self.test_gapfill_database(media,target,before_filtering=prefilter): + 
+    def run_gapfilling(
+        self,
+        media=None,
+        target=None,
+        minimum_obj=None,
+        binary_check=False,
+        prefilter=True,
+    ):
+        """Run gapfilling on a single media condition to force the model to achieve a nonzero specified objective
+
+        Parameters
+        ----------
+        media : MSMedia
+            Media in which the model should be gapfilled
+        target : string
+            Name or expression describing the reaction or combination of reactions to be optimized
+        minimum_obj : double
+            Value to use for the minimal objective threshold that the model must be gapfilled to achieve
+        binary_check : bool
+            Indicates if the solution should be checked to ensure it is minimal in the number of reactions involved
+        prefilter : bool
+            Indicates if the gapfilling database should be prefiltered using the tests provided in the MSGapfill constructor before running gapfilling
+        """
+        # Setting target and media if specified
+        if not target:
+            target = self.default_target
+        if not minimum_obj:
+            minimum_obj = self.default_minimum_objective
+        self.gfpkgmgr.getpkg("GapfillingPkg").set_base_objective(target, minimum_obj)
+        if media:
+            self.gfpkgmgr.getpkg("GapfillingPkg").set_media(media)
+
+        # Testing if gapfilling can work before filtering
+        if not self.test_gapfill_database(media, target, before_filtering=prefilter):
+            return None
-        if self.lp_filename:
-            with open(self.lp_filename, "w") as out:
-                out.write(str(self.gfmodel.solver))
+
+        # Filtering
+        if prefilter:
+            self.prefilter(growth_conditions=[{
+                "media": media,
+                "is_max_threshold": False,
+                "threshold": minimum_obj,
+                "objective": target,
+            }])
+            if not self.test_gapfill_database(media, target, before_filtering=False):
+                return None
+
+        # Printing the gapfilling LP file
+        self.mdlutl.printlp(model=self.gfmodel, filename="StandardGapfill", print=False)
+
+        # Running gapfilling and checking solution
         sol = self.gfmodel.optimize()
         logger.debug(
-            "gapfill solution objective value %f (%s) for media %s",
-            sol.objective_value,
-            sol.status,
-            media,
+            f"gapfill solution objective value {sol.objective_value} ({sol.status}) for media {media}"
         )
         if sol.status != "optimal":
-            logger.debug("No solution found for %s", media)
+            logger.warning("No solution found for %s", media)
             return None

-        self.last_solution = pkgmgr.getpkg("GapfillingPkg").compute_gapfilled_solution()
+        # Computing solution and ensuring all tests still pass
+        self.last_solution = self.gfpkgmgr.getpkg(
+            "GapfillingPkg"
+        ).compute_gapfilled_solution()
         if self.test_conditions:
-            self.last_solution = pkgmgr.getpkg("GapfillingPkg").run_test_conditions(
+            self.last_solution = self.gfpkgmgr.getpkg(
+                "GapfillingPkg"
+            ).run_test_conditions(
                 self.test_conditions,
                 self.last_solution,
                 self.test_condition_iteration_limit,
             )
             if self.last_solution is None:
-                logger.debug(
-                    "No solution could be found that satisfied all \
-                    specified test conditions in specified iterations!"
+                logger.warning(
+                    "No solution could be found that satisfied all specified test conditions in specified iterations!"
                 )
                 return None
+
+        # Running binary check to reduce solution to minimal reaction solution
         if binary_check:
-            return pkgmgr.getpkg("GapfillingPkg").binary_check_gapfilling_solution()
+            self.last_solution = self.gfpkgmgr.getpkg(
+                "GapfillingPkg"
+            ).binary_check_gapfilling_solution()
+
+        # Setting last solution data
+        self.last_solution["media"] = media
+        self.last_solution["target"] = target
+        self.last_solution["minobjective"] = minimum_obj
+        self.last_solution["binary_check"] = binary_check
         return self.last_solution
+
+    def run_global_gapfilling(
+        self,
+        medias,
+        targets,
+        thresholds,
+        binary_check=False,
+        prefilter=True,
+    ):
+        """Run gapfilling across multiple media conditions at once, forcing the model to achieve a nonzero specified objective in each
+
+        Parameters
+        ----------
+        medias : [MSMedia]
+            Medias in which the model should be gapfilled
+        targets : [string]
+            Names or expressions describing the reactions or combinations of reactions to be optimized
+        thresholds : [double]
+            Values to use for the minimal objective thresholds that the model must be gapfilled to achieve
+        binary_check : bool
+            Indicates if the solution should be checked to ensure it is minimal in the number of reactions involved
+        prefilter : bool
+            Indicates if the gapfilling database should be prefiltered using the tests provided in the MSGapfill constructor before running gapfilling
+        """
+        start_time = time.time()
+        # Testing if gapfilling can work before filtering
+        test_output = self.test_and_adjust_gapfilling_conditions(medias, targets, thresholds, prefilter=prefilter)
+        # If none of the media conditions can be gapfilled, then return None
+        if len(test_output["medias"]) == 0:
+            return None
+        # Adding max flux
variables + self.gfpkgmgr.getpkg("GapfillingPkg").create_max_flux_variables() + #Instantiating all models to be merged + merged_model = None + model_list = [] + pkgmgrs = {} + for i,media in enumerate(test_output["medias"]): + #Setting the objective + self.gfpkgmgr.getpkg("GapfillingPkg").set_base_objective(test_output["targets"][i],test_output["thresholds"][i]) + #Setting the media + self.gfpkgmgr.getpkg("GapfillingPkg").set_media(media) + #Copying model and either making it the base model or adding to the model list + model_cpy = self.gfmodel.copy() + + if i == 0: + merged_model = model_cpy + else: + model_list.append(model_cpy) + #Merging all models + mergpkgmgr = MSPackageManager.get_pkg_mgr(merged_model) + mergpkgmgr.getpkg("ProblemReplicationPkg").build_package({ + "models":model_list, + "shared_variable_packages":{ + "GapfillingPkg" : ["rmaxf","fmaxf"] + } + }) + mergfpkg = mergpkgmgr.getpkg("GapfillingPkg") + origgfpkg = self.gfpkgmgr.getpkg("GapfillingPkg") + #Setting the objective + reaction_objective = merged_model.problem.Objective(Zero, direction="min") + obj_coef = dict() + gfrxnidhash = dict() + for rxnid in mergfpkg.variables["rmaxf"]: + gfrxnidhash[rxnid] = {"reverse":mergfpkg.variables["rmaxf"][rxnid]} + if rxnid in origgfpkg.gapfilling_penalties: + if "reverse" in origgfpkg.gapfilling_penalties[rxnid]: + obj_coef[mergfpkg.variables["rmaxf"][rxnid]] = abs(origgfpkg.gapfilling_penalties[rxnid]["reverse"]) + else: + obj_coef[mergfpkg.variables["rmaxf"][rxnid]] = 1 + else: + obj_coef[mergfpkg.variables["rmaxf"][rxnid]] = 1 + for rxnid in mergfpkg.variables["fmaxf"]: + if rxnid not in gfrxnidhash: + gfrxnidhash[rxnid] = {"forward":mergfpkg.variables["fmaxf"][rxnid]} + else: + gfrxnidhash[rxnid]["forward"] = mergfpkg.variables["fmaxf"][rxnid] + if rxnid in origgfpkg.gapfilling_penalties: + if "forward" in origgfpkg.gapfilling_penalties[rxnid]: + obj_coef[mergfpkg.variables["fmaxf"][rxnid]] = abs(origgfpkg.gapfilling_penalties[rxnid]["forward"]) + else: + obj_coef[mergfpkg.variables["fmaxf"][rxnid]] = 1 + else: + obj_coef[mergfpkg.variables["fmaxf"][rxnid]] = 1 + merged_model.objective = reaction_objective + reaction_objective.set_linear_coefficients(obj_coef) + # Printing the gapfilling LP file + self.mdlutl.printlp(model=merged_model,filename="GlobalGapfill",print=True) + + # Running gapfilling and checking solution + print("Starting global optimization-",time.time()-start_time) + sol = merged_model.optimize() + print("Global optimization complete-",time.time()-start_time) + logger.info( + f"gapfill solution objective value {sol.objective_value} ({sol.status}) for media {media}" + ) + if sol.status != "optimal": + logger.warning("No solution found for %s", media) + return None + + # Computing solution and ensuring all tests still pass + self.last_solution = {"new":{},"reversed":{},"media":test_output["medias"][0],"target":test_output["targets"][0],"minobjective":test_output["thresholds"][0],"binary_check":False} + flux_values = {} + for rxnid in origgfpkg.gapfilling_penalties: + flux_values[rxnid] = {} + flux_values[rxnid]["reverse"] = merged_model.reactions.get_by_id(rxnid).reverse_variable.primal + flux_values[rxnid]["forward"] = merged_model.reactions.get_by_id(rxnid).forward_variable.primal + for rxnid in gfrxnidhash: + if rxnid not in flux_values: + flux_values[rxnid] = {} + penalty = 0 + if "reverse" in gfrxnidhash[rxnid]: + if rxnid in origgfpkg.gapfilling_penalties and "reverse" in origgfpkg.gapfilling_penalties[rxnid]: + penalty = 
origgfpkg.gapfilling_penalties[rxnid]["reverse"] + if gfrxnidhash[rxnid]["reverse"].primal > 1e-8: + logger.debug(f"{rxnid} reverse {gfrxnidhash[rxnid]['reverse'].primal} {penalty}") + flux_values[rxnid]["reverse"] = gfrxnidhash[rxnid]["reverse"].primal + penalty = 0 + if "forward" in gfrxnidhash[rxnid]: + if rxnid in origgfpkg.gapfilling_penalties and "forward" in origgfpkg.gapfilling_penalties[rxnid]: + penalty = origgfpkg.gapfilling_penalties[rxnid]["forward"] + if gfrxnidhash[rxnid]["forward"].primal > 1e-8: + logger.debug(f"{rxnid} forward {gfrxnidhash[rxnid]['forward'].primal} {penalty}") + flux_values[rxnid]["forward"] = gfrxnidhash[rxnid]["forward"].primal + global_solution = origgfpkg.compute_gapfilled_solution(flux_values) + logger.info(f"Gloabl solution: {global_solution}") + print("Global gapfilling done -",time.time()-start_time) + return global_solution + + def run_multi_gapfill( + self, + media_list, + target=None, + target_hash={}, + minimum_objectives={}, + default_minimum_objective=None, + binary_check=False, + prefilter=True, + check_for_growth=True, + gapfilling_mode="Sequential", + run_sensitivity_analysis=True, + integrate_solutions=True, + remove_unneeded_reactions=True + ): + """Run gapfilling across an array of media conditions ultimately using different integration policies: simultaneous gapfilling, independent gapfilling, cumulative gapfilling + Parameters + ---------- + media_list : [MSMedia] + List of the medias in which the model should be gapfilled + target : string + Name or expression describing the reaction or combination of reactions to the optimized + minimum_objectives : {string - media ID : double - minimum objective value} + Media-specific minimal objective thresholds that the model must be gapfilled to achieve + default_minimum_objective : double + Default value to use for the minimal objective threshold that the model must be gapfilled to achieve + binary_check : bool + Indicates if the solution should be checked to ensure it is minimal in the number of reactions involved + prefilter : bool + Indicates if the gapfilling database should be prefiltered using the tests provided in the MSGapfill constructor before running gapfilling + check_for_growth : bool + Indicates if the model should be checked to ensure that the resulting gapfilling solution produces a nonzero objective + gapfilling_mode : string + Indicates the integration policy to be used: Global, Independent, and Cumulative + run_sensitivity_analysis : bool + Indicates if sensitivity analysis should be run on the gapfilling solution to determine biomass dependency + """ + #If not integrating, backing up and replacing self.mdlutl + oldmdlutl = self.mdlutl + if not integrate_solutions: + self.model = cobra.io.json.from_json(cobra.io.json.to_json(self.model)) + self.mdlutl = MSModelUtil.get(self.model) + #Setting the default minimum objective + if default_minimum_objective == None: + default_minimum_objective = self.default_minimum_objective + self.gfpkgmgr.getpkg("GapfillingPkg").parameters["minimum_obj"] = default_minimum_objective + # Testing if gapfilling can work before and after filtering + targets = [] + thresholds = [] + for media in media_list: + currtarget = target + if media in target_hash: + currtarget = target_hash[media] + targets.append(currtarget) + minimum_obj = default_minimum_objective + if media in minimum_objectives: + minimum_obj = minimum_objectives[media] + thresholds.append(minimum_obj) + test_output = 
+    def run_multi_gapfill(
+        self,
+        media_list,
+        target=None,
+        target_hash={},
+        minimum_objectives={},
+        default_minimum_objective=None,
+        binary_check=False,
+        prefilter=True,
+        check_for_growth=True,
+        gapfilling_mode="Sequential",
+        run_sensitivity_analysis=True,
+        integrate_solutions=True,
+        remove_unneeded_reactions=True
+    ):
+        """Runs gapfilling across an array of media conditions using one of three integration policies: Global (simultaneous) gapfilling, Independent gapfilling, or Sequential (cumulative) gapfilling
+        Parameters
+        ----------
+        media_list : [MSMedia]
+            List of the media in which the model should be gapfilled
+        target : string
+            Name or expression describing the reaction or combination of reactions to be optimized
+        target_hash : {string - media ID : string - target}
+            Media-specific targets that override the default target
+        minimum_objectives : {string - media ID : double - minimum objective value}
+            Media-specific minimal objective thresholds that the model must be gapfilled to achieve
+        default_minimum_objective : double
+            Default value to use for the minimal objective threshold that the model must be gapfilled to achieve
+        binary_check : bool
+            Indicates if the solution should be checked to ensure it is minimal in the number of reactions involved
+        prefilter : bool
+            Indicates if the gapfilling database should be prefiltered using the tests provided in the MSGapfill constructor before running gapfilling
+        check_for_growth : bool
+            Indicates if the model should be checked to ensure that the resulting gapfilling solution produces a nonzero objective
+        gapfilling_mode : string
+            Indicates the integration policy to be used: Global, Independent, or Sequential
+        run_sensitivity_analysis : bool
+            Indicates if sensitivity analysis should be run on the gapfilling solution to determine biomass dependency
+        integrate_solutions : bool
+            Indicates if the gapfilling solutions should be integrated into the model
+        remove_unneeded_reactions : bool
+            Indicates if reactions that prove unnecessary for growth should be removed during integration
+        """
+        #If not integrating, backing up and replacing self.mdlutl
+        oldmdlutl = self.mdlutl
+        if not integrate_solutions:
+            self.model = cobra.io.json.from_json(cobra.io.json.to_json(self.model))
+            self.mdlutl = MSModelUtil.get(self.model)
+        #Setting the default minimum objective
+        if default_minimum_objective is None:
+            default_minimum_objective = self.default_minimum_objective
+        self.gfpkgmgr.getpkg("GapfillingPkg").parameters["minimum_obj"] = default_minimum_objective
+        # Testing if gapfilling can work before and after filtering
+        targets = []
+        thresholds = []
+        for media in media_list:
+            currtarget = target
+            if media in target_hash:
+                currtarget = target_hash[media]
+            targets.append(currtarget)
+            minimum_obj = default_minimum_objective
+            if media in minimum_objectives:
+                minimum_obj = minimum_objectives[media]
+            thresholds.append(minimum_obj)
+        test_output = self.test_and_adjust_gapfilling_conditions(media_list, targets, thresholds, prefilter=prefilter)
+        #If there are no media left, don't run gapfilling
+        if len(test_output["medias"]) == 0:
+            return None
+        #Iterating over all media and running gapfilling
+        solution_dictionary = {}
+        cumulative_solution = []
+        for i, media in enumerate(test_output["medias"]):
+            #Implementing the specified gapfilling mode
+            if gapfilling_mode == "Independent" or gapfilling_mode == "Sequential":
+                print("Running "+gapfilling_mode+" gapfilling!")
+                solution = self.run_gapfilling(
+                    media,
+                    test_output["targets"][i],
+                    test_output["thresholds"][i],
+                    binary_check,
+                    False,
+                )
+                #If there is a solution, go ahead and integrate it into the model
+                if solution:
+                    solution_dictionary[media] = self.integrate_gapfill_solution(
+                        solution,
+                        cumulative_solution=cumulative_solution,
+                        remove_unneeded_reactions=remove_unneeded_reactions,
+                        check_for_growth=check_for_growth,
+                        gapfilling_mode=gapfilling_mode
+                    )
+                    #If we are doing cumulative gapfilling, then we need to adjust the gapfilling objective so it no longer penalizes using the current solution reactions
+                    if gapfilling_mode == "Sequential":
+                        self.gfpkgmgr.getpkg("GapfillingPkg").compute_gapfilling_penalties(exclusion_solution=cumulative_solution, reaction_scores=self.reaction_scores)
+                        self.gfpkgmgr.getpkg("GapfillingPkg").build_gapfilling_objective_function()
+        if gapfilling_mode == "Global":
+            #Now we run simultaneous gapfilling on a combination of all our various gapfilled models
+            print("Running global gapfilling!")
+            full_solution = self.run_global_gapfilling(
+                medias=test_output["medias"],
+                targets=test_output["targets"],
+                thresholds=test_output["thresholds"],
+                binary_check=binary_check,
+                prefilter=False
+            )
+            #Now we integrate the full solution into the model for every media, which effectively determines which reactions are needed for each media
+            for i, item in enumerate(test_output["medias"]):
+                copy_solution = full_solution.copy()
+                copy_solution["media"] = item
+                copy_solution["target"] = test_output["targets"][i]
+                copy_solution["minobjective"] = test_output["thresholds"][i]
+                copy_solution["binary_check"] = binary_check
+                #In this case we do not remove unneeded reactions from the model because they may be needed for other media
+                solution_dictionary[item] = self.integrate_gapfill_solution(
+                    copy_solution,
+                    cumulative_solution=cumulative_solution,
+                    remove_unneeded_reactions=False,
+                    check_for_growth=check_for_growth,
+                    gapfilling_mode=gapfilling_mode
+                )
+            #Now we remove reactions unneeded for any of the specified media conditions
+            #There is a danger here that the integration step will put a reaction into a solution that subsequently gets removed at this step.
This is something to look out for + unneeded = self.mdlutl.test_solution( + cumulative_solution, + test_output["targets"], + test_output["medias"], + thresholds=test_output["thresholds"], + remove_unneeded_reactions=True, + do_not_remove_list=[] + )#Returns reactions in cumulative solution that are not needed for growth + print("Unneeded in global gapfill:",unneeded) + elif gapfilling_mode == "Sequential": + #Restoring the gapfilling objective function + self.gfpkgmgr.getpkg("GapfillingPkg").compute_gapfilling_penalties(reaction_scores=self.reaction_scores) + self.gfpkgmgr.getpkg("GapfillingPkg").build_gapfilling_objective_function() + #Running sensitivity analysis once on the cumulative solution for all media + #with open("datacache/solutions.json", 'w') as f: + #json.dump(solution_dictionary,f,indent=4,skipkeys=True) + if run_sensitivity_analysis: + logger.info( + "Gapfilling sensitivity analysis running" + ) + #First aggregating all unique reactions with a media for each + reaction_media_hash = {} + solution_rxn_types = ["new","reversed"] + media_reaction_hash = {} + for media in solution_dictionary: + if solution_dictionary[media]["growth"] > 0: + for rxn_type in solution_rxn_types: + for rxn_id in solution_dictionary[media][rxn_type]: + if rxn_id not in reaction_media_hash: + reaction_media_hash[rxn_id] = {} + if solution_dictionary[media][rxn_type][rxn_id] not in reaction_media_hash[rxn_id]: + reaction_media_hash[rxn_id][solution_dictionary[media][rxn_type][rxn_id]] = media + if media not in media_reaction_hash: + media_reaction_hash[media] = {} + media_reaction_hash[media][rxn_id] = solution_dictionary[media][rxn_type][rxn_id] + #Running sensitivity analysis on minimal reactions in each media + rxn_sensitivity_hash = {} + for media in media_reaction_hash: + test_solution = [] + for rxn in media_reaction_hash[media]: + test_solution.append([rxn, media_reaction_hash[media][rxn]]) + self.mdlutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(media) + sensitivity_results = self.mdlutl.find_unproducible_biomass_compounds( + target, test_solution + ) + for rxn in sensitivity_results: + if rxn not in rxn_sensitivity_hash: + rxn_sensitivity_hash[rxn] = {} + for dir in sensitivity_results[rxn]: + rxn_sensitivity_hash[rxn][dir] = sensitivity_results[rxn][dir] + #Building gapfilling sensitivity output + gf_sensitivity = self.mdlutl.get_attributes("gf_sensitivity", {}) + for media in solution_dictionary: + if media.id not in gf_sensitivity: + gf_sensitivity[media.id] = {} + if target not in gf_sensitivity[media.id]: + gf_sensitivity[media.id][target] = {} + if solution_dictionary[media]["growth"] > 0: + gf_sensitivity[media.id][target]["success"] = {} + for rxn_type in solution_rxn_types: + for rxn_id in solution_dictionary[media][rxn_type]: + if rxn_id not in gf_sensitivity[media.id][target]["success"]: + gf_sensitivity[media.id][target]["success"][rxn_id] = {} + gf_sensitivity[media.id][target]["success"][rxn_id][solution_dictionary[media][rxn_type][rxn_id]] = rxn_sensitivity_hash[rxn_id][solution_dictionary[media][rxn_type][rxn_id]] + else: + gf_sensitivity[media.id][target]["failure"] = {} + self.mdlutl.save_attributes(gf_sensitivity, "gf_sensitivity") + #Restoring backedup model + self.mdlutl = oldmdlutl + self.model = oldmdlutl.model + #Returning the solution dictionary + return solution_dictionary - def integrate_gapfill_solution(self, solution): - for rxn_id in solution["reversed"]: - rxn = self.model.reactions.get_by_id(rxn_id) - if solution["reversed"][rxn_id] == ">": + def 
integrate_gapfill_solution(
+        self, solution, cumulative_solution=[], remove_unneeded_reactions=False, check_for_growth=True, gapfilling_mode="Sequential"
+    ):
+        """Integrating gapfilling solution into the model
+        Parameters
+        ----------
+        solution : dict
+            Specifies the reactions to be added to the model to implement the gapfilling solution
+        cumulative_solution : list
+            Optional array to cumulatively track all reactions added to the model when integrating multiple solutions
+        remove_unneeded_reactions : bool
+            Indicates whether unneeded reactions should be removed from the model
+        check_for_growth : bool
+            Indicates if the model should be checked to ensure that the resulting gapfilling solution produces a nonzero objective
+        gapfilling_mode : string - Sequential, Independent, or Global
+            Specifies the gapfilling mode because this determines how integration is performed
+        """
+        logger.debug(f"Initial solution: {str(solution)}")
+        original_objective = self.mdlutl.model.objective
+        self.mdlutl.model.objective = solution["target"]
+        self.mdlutl.model.objective.direction = "max"
+        #If gapfilling mode is independent, we should remove the cumulative solution from the model before integrating the current solution
+        if gapfilling_mode == "Independent":
+            for item in cumulative_solution:
+                rxn = self.model.reactions.get_by_id(item[0])
+                if item[1] == ">":
+                    rxn.upper_bound = 0
+                else:
+                    rxn.lower_bound = 0
+        new_cumulative_reactions = []
+        #Converting the solution to a list
+        list_solution = self.mdlutl.convert_solution_to_list(solution)
+        for item in list_solution:
+            if item[0] not in self.model.reactions:
+                logger.debug(f"adding reaction: {str(item[0])}")
+                #Copying and adding the reaction to the model
+                rxn = self.gfmodel.reactions.get_by_id(item[0])
+                rxn = rxn.copy()
+                self.model.add_reactions([rxn])
+                #Clearing current bounds because we only want to add the reaction in the direction it was gapfilled in
+                rxn.upper_bound = 0
+                rxn.lower_bound = 0
+            logger.info(f"integrating rxn: {item[0]}")
+            rxn = self.model.reactions.get_by_id(item[0])
+            #Setting genes if the reaction has no genes
+            if len(rxn.genes) == 0:
+                #Setting genes from reaction scores if we have them
+                coreid = re.sub(r"_[a-z]\d+$", "", item[0])
+                if coreid in self.reaction_scores:
+                    logger.info(f"Found reaction scores for coreid: {coreid}")
+                    bestgene = None
+                    bestscore = None
+                    for gene in self.reaction_scores[coreid]:
+                        score = None
+                        if isinstance(self.reaction_scores[coreid][gene], dict):
+                            score = self.reaction_scores[coreid][gene]["probability"]
+                        else:
+                            score = self.reaction_scores[coreid][gene]
+                        if (
+                            not bestgene
+                            or score > bestscore
+                        ):
+                            bestgene = gene
+                            bestscore = score
+                    rxn = self.model.reactions.get_by_id(item[0])
+                    logger.info(f"Assigning gene to reaction: {item[0]} {bestgene}")
+                    rxn.gene_reaction_rule = bestgene
+                    rxn.notes["new_genes"] = bestgene
+                    print("Assigning gene to reaction: "+item[0]+" "+bestgene)
+            #Setting bounds according to the direction the reaction was gapfilled in
+            if item[1] == ">":
                 rxn.upper_bound = 100
             else:
                 rxn.lower_bound = -100
-        for rxn_id in solution["new"]:
-            rxn = self.gfmodel.reactions.get_by_id(rxn_id)
-            rxn = rxn.copy()
-            self.model.add_reactions([rxn])
-            coreid = re.sub(r"_[a-z]\d+$", "", rxn_id)
-            if coreid in self.reaction_scores:
-                bestgene = None
-                for gene in self.reaction_scores[coreid]:
-                    if (
-                        not bestgene
-                        or self.reaction_scores[coreid][gene]
-                        > self.reaction_scores[coreid][bestgene]
-                    ):
-                        bestgene = gene
-                rxn = self.model.reactions.get_by_id(rxn_id)
-                rxn.gene_reaction_rule = bestgene
-            if solution["new"][rxn_id] == ">":
-                rxn.upper_bound = 100
-                rxn.lower_bound = 0
+            #Adding reaction to cumulative solution if it is not already there
+            if not self.mdlutl.find_item_in_solution(cumulative_solution, item):
+                new_cumulative_reactions.append([item[0], item[1], item[2]])
+        #Testing the full cumulative solution to see which reactions are needed for the current media/target
+        full_solution = cumulative_solution + new_cumulative_reactions
+        logger.debug(f"Full solution: {str(full_solution)}")
+        #Setting up structure to store the finalized solution for this media/target
+        current_media_target_solution = {"growth":0,"media":solution["media"],"target":solution["target"],"minobjective":solution["minobjective"],"binary_check":solution["binary_check"],"new":{},"reversed":{}}
+        #If gapfilling is independent, we only check the specific solution
+        if gapfilling_mode == "Independent":
+            unneeded = self.mdlutl.test_solution(list_solution,[solution["target"]],[solution["media"]],[solution["minobjective"]],remove_unneeded_reactions,do_not_remove_list=cumulative_solution)#Returns reactions in the input solution that are not needed for growth
+            for item in list_solution:
+                if not self.mdlutl.find_item_in_solution(unneeded, item):
+                    current_media_target_solution[item[2]][item[0]] = item[1]
+                    if not self.mdlutl.find_item_in_solution(cumulative_solution, item):
+                        cumulative_solution.append(item)
+                #elif not remove_unneeded_reactions and not self.mdlutl.find_item_in_solution(cumulative_solution,item):
+                #    cumulative_solution.append(item)
+            logger.info(f"Cumulative media target solution: {str(current_media_target_solution)}")
+        else:
+            unneeded = self.mdlutl.test_solution(full_solution,[solution["target"]],[solution["media"]],[solution["minobjective"]],remove_unneeded_reactions,do_not_remove_list=cumulative_solution)#Returns reactions in the input solution that are not needed for growth
+            for item in cumulative_solution:
+                if not self.mdlutl.find_item_in_solution(unneeded, item):
+                    current_media_target_solution[item[2]][item[0]] = item[1]
+            for item in new_cumulative_reactions:
+                if not self.mdlutl.find_item_in_solution(unneeded, item):
+                    current_media_target_solution[item[2]][item[0]] = item[1]
+                    cumulative_solution.append(item)
+                #elif not remove_unneeded_reactions:
+                #    cumulative_solution.append(item)
+            logger.debug(f"Unneeded: {str(unneeded)}")
+        logger.debug(f"Cumulative: {str(self.cumulative_gapfilling)}")
+        #Checking that the final integrated model grows
+        if check_for_growth:
+            self.mdlutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(solution["media"])
+            current_media_target_solution["growth"] = self.mdlutl.model.slim_optimize()
+            logger.info(f"Growth: {str(current_media_target_solution['growth'])} {solution['media'].id}")
+        # Adding the gapfilling solution data to the model, which is needed for saving the model in KBase
+        self.mdlutl.add_gapfilling(current_media_target_solution)
+        # Testing which gapfilled reactions are needed to produce each reactant in the objective function
+        self.cumulative_gapfilling.extend(cumulative_solution)
+        return current_media_target_solution
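A sketch of sequential integration across two media, assuming `gf` is an MSGapfill instance and `media_a`/`media_b` are MSMedia objects (illustrative names, not part of this patch). Passing an explicit list for `cumulative_solution` also sidesteps Python's shared-mutable-default pitfall in the signature above.

```python
# Hypothetical sketch: integrate per-media solutions cumulatively, so reactions
# kept for media_a are available (and tracked) when gapfilling media_b.
cumulative = []
for media in [media_a, media_b]:
    sol = gf.run_gapfilling(media, "bio1", minimum_obj=0.1)
    if sol:
        integrated = gf.integrate_gapfill_solution(
            sol,
            cumulative_solution=cumulative,   # mutated in place with kept reactions
            remove_unneeded_reactions=True,
            check_for_growth=True,
            gapfilling_mode="Sequential",
        )
        print(media, integrated["growth"], list(integrated["new"]))
```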
= bestgene - if solution["new"][rxn_id] == ">": - rxn.upper_bound = 100 - rxn.lower_bound = 0 + #Adding reaction to cumulative solution if it is not already there + if not self.mdlutl.find_item_in_solution(cumulative_solution,item): + new_cumulative_reactions.append([item[0], item[1],item[2]]) + #Testing the full cumulative solution to see which reactions are needed for current media/target + full_solution = cumulative_solution + new_cumulative_reactions + logger.debug(f"Full solution: {str(full_solution)}") + #Setting up structure to store the finalized solution for this media/target + current_media_target_solution = {"growth":0,"media":solution["media"],"target":solution["target"],"minobjective":solution["minobjective"],"binary_check":solution["binary_check"] ,"new":{},"reversed":{}} + #If gapfilling is independent, we only check the specific solution + if gapfilling_mode == "Independent": + unneeded = self.mdlutl.test_solution(list_solution,[solution["target"]],[solution["media"]],[solution["minobjective"]],remove_unneeded_reactions,do_not_remove_list=cumulative_solution)#Returns reactions in input solution that are not needed for growth + for item in list_solution: + if not self.mdlutl.find_item_in_solution(unneeded,item): + current_media_target_solution[item[2]][item[0]] = item[1] + if not self.mdlutl.find_item_in_solution(cumulative_solution,item): + cumulative_solution.append(item) + #elif not remove_unneeded_reactions and not self.mdlutl.find_item_in_solution(cumulative_solution,item): + # cumulative_solution.append(item) + logger.info(f"Cumulative media target solution: {str(current_media_target_solution)}") + else: + unneeded = self.mdlutl.test_solution(full_solution,[solution["target"]],[solution["media"]],[solution["minobjective"]],remove_unneeded_reactions,do_not_remove_list=cumulative_solution)#Returns reactions in input solution that are not needed for growth + for item in cumulative_solution: + if not self.mdlutl.find_item_in_solution(unneeded,item): + current_media_target_solution[item[2]][item[0]] = item[1] + for item in new_cumulative_reactions: + if not self.mdlutl.find_item_in_solution(unneeded,item): + current_media_target_solution[item[2]][item[0]] = item[1] + cumulative_solution.append(item) + #elif not remove_unneeded_reactions: + # cumulative_solution.append(item) + logger.debug(f"Unneeded: {str(unneeded)}") + logger.debug(f"Cumulative: {str(self.cumulative_gapfilling)}") + #Checking that the final integrated model grows + if check_for_growth: + self.mdlutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(solution["media"]) + current_media_target_solution["growth"] = self.mdlutl.model.slim_optimize() + logger.info(f"Growth: {str(current_media_target_solution['growth'])} {solution['media'].id}") + # Adding the gapfilling solution data to the model, which is needed for saving the model in KBase + self.mdlutl.add_gapfilling(current_media_target_solution) + # Testing which gapfilled reactions are needed to produce each reactant in the objective function + self.cumulative_gapfilling.extend(cumulative_solution) + return current_media_target_solution + + def compute_reaction_weights_from_expression_data(self, omics_data, annoont): + """Computing reaction weights based on input gene-level omics data + Parameters + ---------- + omics_data : pandas dataframe with genes as rows and conditions as columns + Specifies the reactions to be added to the model to implement the gapfilling solution + annoont : annoont object + Contains reaction, feature id, ontologies, probabilities. 
+            Restructured into a dataframe inside this function
+        Returns
+        -------
+        dict
+            A dictionary with reaction ids as the keys and the calculated weight as the value
+        """
+
+        ### Restructure annoont into Dataframe
+        rows_list = []
+        for reaction, genes in annoont.get_reaction_gene_hash(feature_type="gene").items():
+            for gene, gene_info in genes.items():
+                # Initialize the row with 'Gene' and 'Reactions'
+                row = {"Gene": gene, "Reactions": reaction}
+                # Loop through each evidence in the gene's evidence list
+                for evidence in gene_info["evidence"]:
+                    # Construct column name from the event and ontology for uniqueness
+                    column_name = f"{evidence['ontology']}"
+                    if column_name in row:
+                        row[column_name] = f"{row[column_name]}, {evidence['term']}"
+                    else:
+                        row[column_name] = evidence["term"]
+                rows_list.append(row)
+        restructured_anoot = pd.DataFrame(rows_list)
+
+        ### Integrate Omics, set weights, find indexes for features
+        feature_ids_set = set(omics_data.index)
+
+        # Find indices where 'Gene' values are in 'feature_ids'
+        # isin returns a boolean series that is True where restructured_anoot['Gene'] is in feature_ids_set
+        mask = restructured_anoot["Gene"].isin(feature_ids_set)
+        # Get the indices of True values in the mask
+        idx_measuredGene = mask[mask].index.tolist()
+        # Calculate the dimensions for the measuredGeneScore array
+        num_genes = len(restructured_anoot["Gene"])
+        num_columns = len(restructured_anoot.columns[2:])
+        # Initialize the measuredGeneScore array with zeros
+        measuredGeneScore = np.zeros((num_genes, num_columns))
+        measuredGeneScore[idx_measuredGene, :] = 1
+        num_weights = len(restructured_anoot.columns[3:])
+        w = np.repeat(1 / num_weights, num_weights)
+
+        ### Calculate Weights and generate the reaction/weight hash
+        num_cols = len(restructured_anoot.columns[2:])
+        # Note: this overwrites the w computed above with one uniform weight per ontology column
+        w = np.full((num_cols, 1), 1 / num_cols)
+        p = np.zeros(len(restructured_anoot["Reactions"]))
+        # computed_weights is the rxn_hash ({rxn: weight, ...})
+        computed_weights = {}
+
+        # Precompute gene reaction lookups
+        gene_reaction_lookup = {}
+        for idx, row in restructured_anoot.iterrows():
+            gene = row["Gene"]
+            reaction = row["Reactions"]
+            if gene in gene_reaction_lookup:
+                gene_reaction_lookup[gene].append(reaction)
             else:
-                rxn.upper_bound = 0
-                rxn.lower_bound = -100
-        return self.model
+                gene_reaction_lookup[gene] = [reaction]
+
+        for rxn in range(0, len(restructured_anoot)):
+            # Note: the comprehension variable below shadows the outer loop index
+            substr_rxns = [rxn for rxn in restructured_anoot["Reactions"][[rxn]]]
+            # Get the indices of the rows where the condition is True
+            mask = restructured_anoot["Reactions"] == substr_rxns[0]
+            idx_gene = mask[mask].index
+            nAG = 0
+            nMG = 0
+            nCG = 0
+
+            if len(idx_gene) > 0:
+                # number of genes that map to a reaction
+                nAG = len(idx_gene)
+                for iGene in range(0, nAG):
+                    subset = restructured_anoot.iloc[idx_gene[iGene], 2:].to_numpy()
+                    # Checking for non-empty elements in the subset
+                    non_empty_check = np.vectorize(lambda x: x is not None and x == x)(
+                        subset
+                    )
+                    # Finding the maximum value between the non-empty check and the corresponding row in measuredGeneScore
+                    max_value = np.maximum(
+                        non_empty_check, measuredGeneScore[idx_gene[iGene], :]
+                    )
+                    # Multiplying by the weight and adding to nMG
+                    nMG += max(sum((max_value * w)))
+                    selected_gene = restructured_anoot["Gene"].iloc[idx_gene[iGene]]
+
+                    # Finding reactions associated with genes that contain the selected gene
+                    associated_reactions = gene_reaction_lookup.get(selected_gene, [])
+
+                    # Checking if there is more than one associated reaction
+                    if len(associated_reactions) > 1:
+                        nCG += 1
+
+            p[rxn] = (nMG / nAG) * (1 / (1 + (nCG / nAG)))
+
+            # Add item to output rxn hash dictionary
+            computed_weights[restructured_anoot.iloc[rxn, 0]] = p[rxn]
+
+        return computed_weights

     @staticmethod
     def gapfill(
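A sketch of how the expression-weighting method above might be called; `gf` and `annoont` are placeholders for a configured MSGapfill instance and an annotation-ontology object exposing `get_reaction_gene_hash()`, and the DataFrame contents are fabricated for illustration.

```python
# Hypothetical sketch: omics data indexed by gene id, one column per condition.
import pandas as pd

omics_df = pd.DataFrame(
    {"condition_1": [5.2, 0.0], "condition_2": [3.1, 1.4]},
    index=["gene_a", "gene_b"],
)
weights = gf.compute_reaction_weights_from_expression_data(omics_df, annoont)
# weights maps reaction ids to scores (roughly 0 to 1): higher when more of a
# reaction's genes are measured and those genes are not shared across reactions.
for rxn_id, weight in sorted(weights.items(), key=lambda kv: -kv[1])[:5]:
    print(rxn_id, round(weight, 3))
```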
diff --git a/modelseedpy/core/msgenome.py b/modelseedpy/core/msgenome.py
index f052130d..1743c390 100644
--- a/modelseedpy/core/msgenome.py
+++ b/modelseedpy/core/msgenome.py
@@ -8,6 +8,24 @@
 DEFAULT_SPLIT = " "
 
+def to_fasta(features, filename, line_size=80, fn_header=None):
+    with open(filename, "w") as fh:
+        for feature in features:
+            if feature.seq:
+                h = f">{feature.id}\n"
+                if fn_header:
+                    h = fn_header(feature)
+                fh.write(h)
+                _seq = feature.seq
+                lines = [
+                    _seq[i : i + line_size] + "\n"
+                    for i in range(0, len(_seq), line_size)
+                ]
+                for line in lines:
+                    fh.write(line)
+    return filename
+
+
 def normalize_role(s):
     s = s.strip().lower()
     s = re.sub(r"[\W_]+", "", s)
@@ -15,8 +33,25 @@ def normalize_role(s):
 
 def read_fasta(f, split=DEFAULT_SPLIT, h_func=None):
-    with open(f, "r") as fh:
-        return parse_fasta_str(fh.read(), split, h_func)
+    if f.endswith(".gz"):
+        import gzip
+
+        with gzip.open(f, "rb") as fh:
+            return parse_fasta_str(fh.read().decode("utf-8"), split, h_func)
+    else:
+        with open(f, "r") as fh:
+            return parse_fasta_str(fh.read(), split, h_func)
+
+
+def read_fasta2(f, split=DEFAULT_SPLIT, h_func=None):
+    if f.endswith(".gz"):
+        import gzip
+
+        with gzip.open(f, "rb") as fh:
+            return extract_features(fh.read().decode("utf-8"), split, h_func)
+    else:
+        with open(f, "r") as fh:
+            return extract_features(fh.read(), split, h_func)
 
 
 def parse_fasta_str(faa_str, split=DEFAULT_SPLIT, h_func=None):
@@ -47,8 +82,60 @@ def parse_fasta_str(faa_str, split=DEFAULT_SPLIT, h_func=None):
     return features
 
 
+def read_gbff_records_from_file(filename: str):
+    if filename.endswith(".gbff"):
+        with open(filename, "r") as fh:
+            return read_gbff_records(fh)
+    elif filename.endswith(".gz"):
+        import gzip
+        from io import StringIO
+
+        with gzip.open(filename, "rb") as fh:
+            return read_gbff_records(StringIO(fh.read().decode("utf-8")))
+
+
+def read_gbff_records(handler):
+    from Bio import SeqIO
+
+    gbff_records = []
+    for record in SeqIO.parse(handler, "gb"):
+        gbff_records.append(record)
+    return gbff_records
+
+
+def extract_features(faa_str, split=DEFAULT_SPLIT, h_func=None):
+    features = []
+    active_seq = None
+    seq_lines = []
+    for line in faa_str.split("\n"):
+        if line.startswith(">"):
+            if active_seq is not None:
+                active_seq.seq = "".join(seq_lines)
+                features.append(active_seq)
+                seq_lines = []
+            seq_id = line[1:]
+            desc = None
+            if h_func:
+                seq_id, desc = h_func(seq_id)
+            elif split:
+                header_data = line[1:].split(split, 1)
+                seq_id = header_data[0]
+                if len(header_data) > 1:
+                    desc = header_data[1]
+            active_seq = MSFeature(seq_id, "", desc)
+        else:
+            seq_lines.append(line.strip())
+
+    # add last sequence
+    if len(seq_lines) > 0:
+        active_seq.seq = "".join(seq_lines)
+        features.append(active_seq)
+
+    return features
+
+
 class MSFeature:
-    def __init__(self, feature_id, sequence, description=None):
+    def __init__(self, feature_id, sequence, description=None, aliases=[]):
         """
         @param feature_id: identifier for the protein coding feature
@@ -60,7 +147,7 @@ def __init__(self, feature_id, sequence, description=None):
         self.seq = sequence
         self.description = description  # temporary replace with proper parsing
         self.ontology_terms = {}
-        self.aliases = []
+        self.aliases = aliases
 
     def add_ontology_term(self, ontology_term, value):
         """
@@ -78,6 +165,9 @@ class 
MSGenome: def __init__(self): self.features = DictList() + self.id = None + self.annoont = None + self.scientific_name = None def add_features(self, feature_list: list): """ @@ -96,14 +186,77 @@ def add_features(self, feature_list: list): self.features += feature_list + def create_new_feature(self,id,sequence): + newftr = MSFeature(id,sequence) + self.add_features([newftr]) + return newftr + @staticmethod - def from_fasta( - filename, contigs=0, split="|", h_func=None - ): # !!! the contigs argument is never used + def from_annotation_ontology( + annoont, prioritized_event_list=None, ontologies=None, merge_all=False,feature_type=None, translate_to_rast=True + ): + gene_hash = annoont.get_gene_term_hash() genome = MSGenome() - genome.features += read_fasta(filename, split, h_func) + features = [] + for gene in gene_hash: + feature = MSFeature(gene.id,"") + features.append(feature) + for term in gene_hash[gene]: + feature.add_ontology_term(term.ontology.id, term.id) + if term.ontology.id == "SSO": + feature.add_ontology_term("RAST",annoont.get_term_name(term)) + genome.add_features(features) return genome + @staticmethod + def from_fasta(filename, split=" ", h_func=None): + genome = MSGenome() + genome.features += read_fasta2(filename, split, h_func) + return genome + + @staticmethod + def from_gbff_sequence(filename): + gbff_records = read_gbff_records_from_file(filename) + genome = MSGenome() + features = [] + for rec in gbff_records: + feature = MSFeature(rec.id, str(rec.seq), description=rec.description) + features.append(feature) + genome.features += features + return genome + + @staticmethod + def from_gbff_features( + filename, feature_id_qualifier="protein_id", description_qualifier="product" + ): + gbff_records = read_gbff_records_from_file(filename) + genome = MSGenome() + features = [] + for rec in gbff_records: + for f in rec.features: + if f.type == "CDS": + translations = f.qualifiers.get("translation", []) + if len(translations) == 1: + feature_id = f.qualifiers.get(feature_id_qualifier, [None])[0] + description = f.qualifiers.get(description_qualifier, [None])[0] + if feature_id: + feature = MSFeature( + feature_id, translations[0], description=description + ) + features.append(feature) + else: + logger.warning( + f"skip feature: unable to fetch id from qualifier {feature_id_qualifier}" + ) + elif len(translations) > 1: + logger.warning(f"skip feature: with multiple sequences {f}") + genome.features += features + return genome + + def to_fasta(self, filename, l=80, fn_header=None): + to_fasta(self.features, filename, l, fn_header) + return filename + @staticmethod def from_dna_fasta(filename): pass @@ -116,10 +269,114 @@ def from_protein_sequences_hash(sequences): return genome def alias_hash(self): - return {alias: gene for gene in self.features for alias in gene.aliases} + output = {} + for gene in self.features: + for alias in gene.aliases: + #Check if alias is a list + if isinstance(alias,list): + if alias[1] not in output: + output[alias[1]] = gene + else: + if alias not in output: + output[alias] = gene + return output def search_for_gene(self, query): if query in self.features: return self.features.get_by_id(query) aliases = self.alias_hash() return aliases[query] if query in aliases else None + + def _repr_html_(self): + return f""" + + + + + + + + +
+<table>
+    <tr>
+        <td><strong>Memory address</strong></td>
+        <td>{f"{id(self):x}"}</td>
+    </tr>
+    <tr>
+        <td><strong>Features</strong></td>
+        <td>{len(self.features)}</td>
+    </tr>
+</table>
""" + + +class GenomeGff(MSGenome): + def __init__(self, contigs): + self.contigs = contigs + super().__init__() + + @staticmethod + def read_sequence(feature_id, gff_record, expected_sequence, contigs): + from Bio.Seq import Seq + from Bio import Align + + protein_seq_cds = expected_sequence + feature_contig = contigs.features.get_by_id(gff_record.contig_id) + seq = Seq(feature_contig.seq[gff_record.start - 1 : gff_record.end]) + if gff_record.strand == "-": + seq = seq.reverse_complement() + seq_from_dna = str(seq.translate()) + if len(seq_from_dna) > 0 and seq_from_dna[-1] == "*": + seq_from_dna = seq_from_dna[:-1] + if len(protein_seq_cds) > 0 and protein_seq_cds[-1] == "*": + protein_seq_cds = protein_seq_cds[:-1] + eq = protein_seq_cds == seq_from_dna + + score = None + if not eq and len(seq_from_dna) > 0: + try: + aligner = Align.PairwiseAligner() + res = aligner.align(protein_seq_cds, seq_from_dna) + score = res.score + except ValueError as ex: + print("error", gff_record) + raise ex + + feature = MSFeature(feature_id, protein_seq_cds) + feature.description = f"score: {score}" + feature.gff = gff_record + return feature + + @staticmethod + def from_fna_faa_gff( + filename_fna, filename_faa, filename_gff, _fn_get_id, prodigal=False + ): + genome_gff_features = _read_gff_features(filename_gff) + genome_faa = MSGenome.from_fasta(filename_faa) + contigs = MSGenome.from_fasta(filename_fna) + + feature_lookup = {} + if prodigal: + for feature in genome_faa.features: + attr = dict( + x.split("=") + for x in feature.description.split(" # ")[-1].split(";") + ) + if attr["ID"] not in feature_lookup: + feature_lookup[attr["ID"]] = feature + else: + raise ValueError("") + else: + feature_lookup = {feature.id: feature for feature in genome_faa.features} + + features = [] + for gff_record in genome_gff_features: + if gff_record.feature_type == "CDS": + feature_id = gff_record.attr.get("ID") + if _fn_get_id: + feature_id = _fn_get_id(gff_record) + + feature_cds = feature_lookup.get(feature_id) + + if feature_cds: + protein_seq_cds = feature_cds.seq + f = GenomeGff.read_sequence( + feature_id, gff_record, protein_seq_cds, contigs + ) + features.append(f) + else: + print(f"not found {feature_id}") + + genome = GenomeGff(contigs) + genome.features += features + return genome diff --git a/modelseedpy/core/msgrowthphenotypes.py b/modelseedpy/core/msgrowthphenotypes.py old mode 100644 new mode 100755 index 6c30bb2a..debf63c5 --- a/modelseedpy/core/msgrowthphenotypes.py +++ b/modelseedpy/core/msgrowthphenotypes.py @@ -5,140 +5,576 @@ from cobra.core.dictlist import DictList from modelseedpy.core.msmedia import MSMedia from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.fbapkg.objectivepkg import ObjectiveData from modelseedpy.core.msmodelutl import MSModelUtil from modelseedpy.core.msgapfill import MSGapfill +from modelseedpy.core.msmedia import MSMedia logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO +zero_threshold = 0.0000001 class MSGrowthPhenotype: def __init__( self, id, - media=None, - growth=None, - gene_ko=[], - additional_compounds=[], - parent=None, + base_media=None, + experimental_value=None, + experimental_value_is_binary=False, + knockouts=[], + additional_compounds={}, + primary_compounds=[], name=None, + gene_association_scores={}, + objective=ObjectiveData.from_string("MAX{bio1}"), + target_element=None, + target_element_limit=10, + parent=None 
): self.id = id self.name = name if name == None: self.name = self.id - self.growth = growth - self.media = media - self.gene_ko = gene_ko - self.gapfilling = None + self.experimental_value = experimental_value + self.experimental_value_is_binary = experimental_value_is_binary + self.base_media = base_media + self.knockouts = knockouts self.additional_compounds = additional_compounds + self.primary_compounds = primary_compounds + self.target_element = target_element + self.gene_association_scores = gene_association_scores + self.objective = objective self.parent = parent + def to_dict(self, media_output_type="complete"): + """ + Convert MSGrowthPhenotype to a dictionary. + + Parameters: + media_output_type (str): Output type for media serialization. + Options: "minimal", "bounds", "complete" (default: "complete") + + Returns: + dict: Dictionary representation of the phenotype + """ + output = { + 'id': self.id, + 'name': self.name, + 'experimental_value': self.experimental_value, + 'experimental_value_is_binary': self.experimental_value_is_binary, + 'knockouts': list(self.knockouts) if self.knockouts else [], + 'additional_compounds': dict(self.additional_compounds) if self.additional_compounds else {}, + 'primary_compounds': list(self.primary_compounds) if self.primary_compounds else [], + 'gene_association_scores': dict(self.gene_association_scores) if self.gene_association_scores else {}, + 'target_element': self.target_element, + 'target_element_limit': getattr(self, 'target_element_limit', 10), + 'objective': self.objective.to_string() if self.objective else None, + } + + # Serialize base_media if present + if self.base_media: + output['base_media'] = self.base_media.to_dict(output_type=media_output_type) + output['base_media_id'] = self.base_media.id + output['base_media_name'] = self.base_media.name + output['base_media_ref'] = self.base_media.media_ref + + return output + + @staticmethod + def from_dict(data, parent=None): + """ + Create MSGrowthPhenotype from a dictionary. 
+ + Parameters: + data (dict): Dictionary containing phenotype data + parent (MSGrowthPhenotypes, optional): Parent phenotype set + + Returns: + MSGrowthPhenotype: A new MSGrowthPhenotype instance + """ + # Reconstruct base_media if present + base_media = None + if 'base_media' in data and data['base_media']: + base_media = MSMedia.from_dict(data['base_media']) + base_media.id = data.get('base_media_id', 'media') + base_media.name = data.get('base_media_name', '') + base_media.media_ref = data.get('base_media_ref') + + # Reconstruct objective + objective = None + if 'objective' in data and data['objective']: + objective = ObjectiveData.from_string(data['objective']) + + return MSGrowthPhenotype( + id=data.get('id'), + base_media=base_media, + experimental_value=data.get('experimental_value'), + experimental_value_is_binary=data.get('experimental_value_is_binary', False), + knockouts=data.get('knockouts', []), + additional_compounds=data.get('additional_compounds', {}), + primary_compounds=data.get('primary_compounds', []), + name=data.get('name'), + gene_association_scores=data.get('gene_association_scores', {}), + objective=objective, + target_element=data.get('target_element'), + target_element_limit=data.get('target_element_limit', 10), + parent=parent + ) + def build_media(self): + """Builds media object to use when simulating the phenotype + Parameters + ---------- + include_base_media : bool + Indicates whether to include the base media for the phenotype set in the formulation + """ cpd_hash = {} + if self.base_media: + cpd_hash = self.base_media.to_dict() for cpd in self.additional_compounds: - cpd_hash[cpd] = 100 + cpd_hash[cpd] = self.additional_compounds[cpd] + for cpd in self.primary_compounds: + if cpd not in cpd_hash: + cpd_hash[cpd] = 100 # Default flux for primary compounds full_media = MSMedia.from_dict(cpd_hash) - if self.media != None: - full_media.merge(self.media, overwrite_overlap=False) - if self.parent != None and self.parent.base_media != None: - full_media.merge(parent.base_media, overwrite_overlap=False) + if self.parent and self.parent.base_media: + print("Adding parent base media to phenotype media") + full_media.merge(self.parent.base_media, overwrite_overlap=False) return full_media + def configure_model_for_phenotype(self,model_or_mdlutl,add_missing_exchanges=True): + """Configures the model to run this phenotype + Parameters + ---------- + model_or_modelutl : Model | MSModelUtl + Model to use to run the simulations + """ + output = {"baseline_objective":0.01} + #Translating model is not MSModelUtil + modelutl = model_or_mdlutl + if not isinstance(model_or_mdlutl, MSModelUtil): + modelutl = MSModelUtil.get(model_or_mdlutl) + #Setting the phenotype objective + modelutl.pkgmgr.getpkg("ObjectivePkg").build_package( + objective_or_string=self.objective, + objective_name=self.name, + set_objective=True + ) + # Setting media in model + modelutl.pkgmgr.getpkg("KBaseMediaPkg").build_package( + self.build_media(), self.parent.base_uptake, self.parent.base_excretion + ) + # Adding transport reactions + if add_missing_exchanges: + ex_output = modelutl.add_missing_exchanges(self.build_media()) + output["missing_transports"] = ex_output + # Adding elemental constraints + if self.target_element: + print("Target element: "+self.target_element) + #Computing baseline growth + reaction_exceptions = [] + modelutl.pkgmgr.getpkg("ElementUptakePkg").build_package( + {self.target_element:self.target_element_limit}, exception_reactions=reaction_exceptions + ) + 
output["baseline_objective"] = modelutl.model.slim_optimize() + #Resetting elemental constraints with exception reactions + exchange_hash = modelutl.exchange_hash() + for item in self.primary_compounds: + if item in exchange_hash: + for rxn in exchange_hash[item]: + if rxn not in reaction_exceptions: + reaction_exceptions.append(rxn) + modelutl.pkgmgr.getpkg("ElementUptakePkg").clear() + modelutl.pkgmgr.getpkg("ElementUptakePkg").build_package( + {self.target_element:self.target_element_limit}, exception_reactions=reaction_exceptions + ) + return output + def simulate( self, - modelutl, - growth_threshold=0.001, + model_or_mdlutl, add_missing_exchanges=False, - save_fluxes=False, - pfba=False, + growth_threshold=0.01, + gapfilling=False, + msgapfill=None, + annoont=None, ): - if not isinstance(modelutl, MSModelUtil): - modelutl = MSModelUtil(modelutl) - media = self.build_media() - output = {"growth": None, "class": None, "missing_transports": []} - if add_missing_exchanges: - output["missing_transports"] = modelutl.add_missing_exchanges(media) - pkgmgr = MSPackageManager.get_pkg_mgr(modelutl.model) - pkgmgr.getpkg("KBaseMediaPkg").build_package( - media, self.parent.base_uptake, self.parent.base_excretion - ) - for gene in self.gene_ko: - if gene in modelutl.model.genes: - geneobj = modelutl.model.genes.get_by_id(gene) - geneobj.knock_out() - solution = modelutl.model.optimize() - output["growth"] = solution.objective_value - if solution.objective_value > 0 and pfba: - solution = cobra.flux_analysis.pfba(modelutl.model) - if save_fluxes: - output["fluxes"] = solution.fluxes - if output["growth"] >= growth_threshold: - if self.growth > 0: - output["class"] = "CP" + """Simulates a single phenotype + Parameters + ---------- + model_or_modelutl : Model | MSModelUtl + Model to use to run the simulations + add_missing_exchanges : bool + Boolean indicating if exchanges for compounds mentioned explicitly in phenotype media should be added to the model automatically + multiplier : double + Indicates a multiplier to use for positive growth above the growth on baseline media + save_fluxes : bool + Indicates if the fluxes should be saved and returned with the results + pfba : bool + Runs pFBA to compute fluxes after initially solving for growth + ignore_experimental_data : bool + Indicates if existing growth data in the phenotype should be ignored when computing the class of the simulated phenotype + """ + output = { + "objective_value": 0, + "experimental_value": self.experimental_value, + "class": "N", + "reactions":None, + "gfreactions":None, + "gapfill_count":0, + "gapfill_count_with_genes":0, + "reaction_count":0, + "fluxes":None, + "objective_string":self.objective.to_string() + } + #Translating model is not MSModelUtil + modelutl = model_or_mdlutl + if not isinstance(model_or_mdlutl, MSModelUtil): + modelutl = MSModelUtil.get(model_or_mdlutl) + target_mdlutl = modelutl + #Switching target model to msgapfill if gapfilling is True + if gapfilling: + if msgapfill == None: + logger.warning( + "MSGapfill must be provided in order to run phenotype gapfilling analysis!" 
+ ) + return None + target_mdlutl = msgapfill.gfmodelutl + #Setting the default score for all model reactions to 0.1 + reaction_scores = {} + for rxn in target_mdlutl.model.reactions: + reaction_scores[rxn.id] = {} + if rxn.id in modelutl.model.reactions: + + if modelutl.model.reactions.get_by_id(rxn.id).upper_bound > 0: + reaction_scores[rxn.id][">"] = 0.01 + else: + reaction_scores[rxn.id][">"] = 2 + if modelutl.model.reactions.get_by_id(rxn.id).lower_bound > 0: + reaction_scores[rxn.id]["<"] = 0.01 + else: + reaction_scores[rxn.id]["<"] = 2 + else: + reaction_scores[rxn.id][">"] = 2 + reaction_scores[rxn.id]["<"] = 2 + #Computing gene associations and reaction scores from annotation ontology + rxn_gene_hash = {} + if annoont != None: + rxn_gene_hash = annoont.get_reaction_gene_hash(feature_type="gene") + direction_list = [">","<"] + for rxn in rxn_gene_hash: + rxnid = rxn+"_c0" + if rxnid not in reaction_scores: + reaction_scores[rxnid] = {">": 2, "<": 2} + for direction in direction_list: + current_score = reaction_scores[rxnid][direction] + rxn_score = None + for gene in rxn_gene_hash[rxn]: + new_score = rxn_gene_hash[rxn][gene]["probability"] + if gene in self.gene_association_scores: + new_score = -1*(1-self.gene_association_scores[gene])*new_score + if rxn_score == None or (new_score < 0 and new_score < rxn_score) or (current_score > 0.1 and rxn_score > 0 and new_score > rxn_score): + rxn_score = new_score + if rxn_score != None: + if rxn_score > 0: + rxn_score = 1+rxn_score + reaction_scores[rxnid][direction] = rxn_score + #Configuring the model for the phenotype + configuration_output = self.configure_model_for_phenotype(target_mdlutl,add_missing_exchanges=add_missing_exchanges) + output["missing_transports"] = configuration_output["missing_transports"] + output["baseline_objective"] = configuration_output["baseline_objective"] + multiplier = 3 + with target_mdlutl.model: + #Implementing knockouts + for item in self.knockouts: + if item in target_mdlutl.model.genes: + geneobj = target_mdlutl.model.genes.get_by_id(item) + geneobj.knock_out() + elif item in target_mdlutl.model.reactions: + rxnobj = target_mdlutl.model.reactions.get_by_id(item) + rxnobj.knock_out() + else: + logger.warning("Gene or reaction "+item+" not found in model") + #Print model LP for debugging + #modelutl.printlp(path="LP_files/",filename=self.id,print=True) + #Getting objective value + solution = target_mdlutl.model.optimize() + print(self.id,solution.status,solution.objective_value) + modelutl.printlp(path="LP_files/",filename="Base-"+self.id,print=True) + output["objective_value"] = solution.objective_value + if output["objective_value"] < 0.000001: + output["objective_value"] = 0 + if gapfilling: + output["status"] = "gapfilling failed" + else: + output["status"] = "no growth without gapfilling" else: + output["class"] = "P" + target_mdlutl.model.reactions.get_by_id("bio1").lower_bound = output["objective_value"] * multiplier + original_objective = target_mdlutl.model.objective + coefobj = target_mdlutl.model.problem.Objective(0, direction="min") + target_mdlutl.model.objective = coefobj + obj_coef = {} + direction_list = [">","<"] + for rxn in reaction_scores: + if rxn in target_mdlutl.model.reactions: + rxnobj = target_mdlutl.model.reactions.get_by_id(rxn) + for direction in direction_list: + if direction == ">": + obj_coef[rxnobj.forward_variable] = reaction_scores[rxn][direction] + elif direction == "<": + obj_coef[rxnobj.reverse_variable] = reaction_scores[rxn][direction] + 
coefobj.set_linear_coefficients(obj_coef)
+                modelutl.printlp(path="LP_files/", filename=self.id, print=True)
+                solution = target_mdlutl.model.optimize()
+                target_mdlutl.model.objective = original_objective
+                target_mdlutl.model.reactions.get_by_id("bio1").lower_bound = 0
+                #Processing solution
+                output["fluxes"] = {}
+                output["reactions"] = []
+                output["gfreactions"] = {}
+                for rxn in target_mdlutl.model.reactions:
+                    if rxn.id in solution.fluxes:
+                        flux = solution.fluxes[rxn.id]
+                        if abs(flux) > 0.000001:
+                            output["fluxes"][rxn.id] = flux
+                            if rxn.id[0:3] != "bio" and rxn.id[0:3] != "EX_" and rxn.id[0:3] != "DM_" and rxn.id[0:3] != "SK":
+                                output["reaction_count"] += 1
+                                output["reactions"].append(rxn.id)
+                            if rxn.id not in modelutl.model.reactions or (flux < -0.000001 and modelutl.model.reactions.get_by_id(rxn.id).lower_bound == 0) or (flux > 0.000001 and modelutl.model.reactions.get_by_id(rxn.id).upper_bound == 0):
+                                output["gapfill_count"] += 1
+                                output["gfreactions"][rxn.id] = None
+                                if rxn.id in rxn_gene_hash and len(rxn_gene_hash[rxn.id]) > 0:
+                                    output["gfreactions"][rxn.id] = list(rxn_gene_hash[rxn.id].keys())
+                                    output["gapfill_count_with_genes"] += 1
+        # Determining phenotype class
+        multiplier = 3
+        ignore_experimental_data = False
+        if output["objective_value"] is not None and output["objective_value"] >= output["baseline_objective"] * multiplier:
+            output["positive"] = True
+            if self.experimental_value is None or ignore_experimental_data:
+                output["class"] = "P"
+            elif self.experimental_value > 0:
+                output["class"] = "CP"
+            elif self.experimental_value == 0:
                 output["class"] = "FP"
         else:
-            if self.growth > 0:
+            output["positive"] = False
+            if self.experimental_value is None or ignore_experimental_data:
+                output["class"] = "N"
+            elif self.experimental_value > 0:
                 output["class"] = "FN"
-            else:
+            elif self.experimental_value == 0:
                 output["class"] = "CN"
         return output
 
     def gapfill_model_for_phenotype(
         self,
-        modelutl,
-        default_gapfill_templates,
+        msgapfill,
         test_conditions,
-        default_gapfill_models=[],
-        blacklist=[],
-        growth_threshold=0.001,
+        multiplier=10,
         add_missing_exchanges=False,
     ):
-        if not isinstance(modelutl, MSModelUtil):
-            modelutl = MSModelUtil(modelutl)
-        self.gapfilling = MSGapfill(
-            modelutl.model,
-            default_gapfill_templates,
-            default_gapfill_models,
-            test_conditions,
-            modelutl.reaction_scores(),
-            blacklist,
+        """Gapfills the model to permit this single phenotype to be positive
+        Parameters
+        ----------
+        msgapfill : MSGapfill
+            Fully configured gapfilling object
+        test_conditions : list
+            Conditions used to test the gapfilling solution
+        multiplier : double
+            Indicates a multiplier to use for positive growth above the growth on baseline media
+        add_missing_exchanges : bool
+            Boolean indicating if exchanges for compounds mentioned explicitly in phenotype media should be added to the model automatically
+        """
+        # First simulate model without gapfilling to assess ungapfilled growth
+        output = self.simulate(
+            msgapfill.mdlutl, add_missing_exchanges=add_missing_exchanges
         )
-        media = self.build_media()
-        if add_missing_exchanges:
-            modelutl.add_missing_exchanges(media)
-        for gene in self.gene_ko:
-            if gene in modelutl.model.genes:
-                geneobj = modelutl.model.genes.get_by_id(gene)
-                geneobj.knock_out()
-        gfresults = self.gapfilling.run_gapfilling(media, None)
-        if gfresults is None:
+        if output["objective_value"] >= output["baseline_objective"] * multiplier:
+            # No gapfilling needed - original model grows without gapfilling
+            return {
+                "reversed": {},
+                "new": {},
+                "media": self.build_media(),
+                "target": output["objective_string"],
+                "minobjective": output["baseline_objective"] * multiplier,
+                "binary_check": False,
+            }
+
+        # Now pulling the gapfilling configured model from MSGapfill
+        gfmodelutl = MSModelUtil.get(msgapfill.gfmodel)
+        # Saving the gapfill objective because this will be replaced when the simulation runs
+        gfobj = gfmodelutl.model.objective
+        # Running simulate on gapfill model to add missing exchanges and set proper media and uptake limit constraints
+        output = self.simulate(
+            gfmodelutl, add_missing_exchanges=add_missing_exchanges
+        )
+        # If the gapfilling model fails to achieve the minimum growth, then no solution exists
+        if output["objective_value"] < output["baseline_objective"] * multiplier:
             logger.warning(
                 "Gapfilling failed with the specified model, media, and target reaction."
             )
-            return self.gapfilling.integrate_gapfill_solution(gfresults)
+            return None
+
+        # Running the gapfilling itself
+        full_media = self.build_media()
+        with gfmodelutl.model:
+            # Applying gene knockouts
+            for gene in self.knockouts:
+                if gene in gfmodelutl.model.genes:
+                    geneobj = gfmodelutl.model.genes.get_by_id(gene)
+                    geneobj.knock_out()
+            gfresults = msgapfill.run_gapfilling(
+                full_media, None, minimum_obj=output["baseline_objective"] * multiplier
+            )
+        if gfresults is None:
+            logger.warning(
+                "Gapfilling failed with the specified model, media, and target reaction."
+            )
+
+        return gfresults
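A sketch of simulating one phenotype against a model, assuming `model` is a cobra.Model (or MSModelUtil) and `pheno` an MSGrowthPhenotype instance; the printed keys follow the `output` dict built in `simulate` above.

```python
# Hypothetical sketch: classify a single phenotype prediction.
result = pheno.simulate(model, add_missing_exchanges=True)
# "class" is CP/CN (correct positive/negative), FP/FN (false positive/negative),
# or P/N when there is no experimental value to compare against.
print(result["class"], result["objective_value"], result["gapfill_count"])
```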
 class MSGrowthPhenotypes:
-    def __init__(self, base_media=None, base_uptake=0, base_excretion=1000):
+    def __init__(
+        self, base_media=None, base_uptake=0, base_excretion=1000, global_atom_limits={}, id=None, name=None, source=None, source_id=None, type=None
+    ):
+        # Check if base_media is a MSMedia object
+        if not isinstance(base_media, MSMedia) and base_media is not None:
+            base_media = MSMedia.from_kbase_object(base_media)
+        self.id = id
+        self.name = name
+        self.source = source
+        self.source_id = source_id
+        self.type = type
         self.base_media = base_media
         self.phenotypes = DictList()
         self.base_uptake = base_uptake
         self.base_excretion = base_excretion
+        self.atom_limits = global_atom_limits
+        self.baseline_objective_data = {}
+        self.cached_based_growth = {}
+
+    def to_dict(self, media_output_type="complete"):
+        """
+        Convert MSGrowthPhenotypes to a dictionary.
+
+        This function serializes the entire phenotype set including all media objects,
+        allowing the phenotype set to be saved locally and restored later without
+        needing to re-fetch from KBase.
+
+        Parameters:
+            media_output_type (str): Output type for media serialization.
+ Options: "minimal", "bounds", "complete" (default: "complete") + + Returns: + dict: Dictionary representation of the phenotype set + """ + output = { + 'id': self.id, + 'name': self.name, + 'source': self.source, + 'source_id': self.source_id, + 'type': self.type, + 'base_uptake': self.base_uptake, + 'base_excretion': self.base_excretion, + 'atom_limits': dict(self.atom_limits) if self.atom_limits else {}, + } + + # Serialize base_media if present + if self.base_media: + output['base_media'] = self.base_media.to_dict(output_type=media_output_type) + output['base_media_id'] = self.base_media.id + output['base_media_name'] = self.base_media.name + output['base_media_ref'] = self.base_media.media_ref + + # Serialize all phenotypes + output['phenotypes'] = [] + for pheno in self.phenotypes: + output['phenotypes'].append(pheno.to_dict(media_output_type=media_output_type)) + + return output + + @staticmethod + def from_dict(data): + """ + Create MSGrowthPhenotypes from a dictionary. + + This function reconstructs the entire phenotype set from a dictionary, + including all media objects. Use this to load a phenotype set that was + previously saved using to_dict(). + + Parameters: + data (dict): Dictionary containing phenotype set data + + Returns: + MSGrowthPhenotypes: A new MSGrowthPhenotypes instance + """ + # Reconstruct base_media if present + base_media = None + if 'base_media' in data and data['base_media']: + base_media = MSMedia.from_dict(data['base_media']) + base_media.id = data.get('base_media_id', 'media') + base_media.name = data.get('base_media_name', '') + base_media.media_ref = data.get('base_media_ref') + + # Create the phenotype set + growthpheno = MSGrowthPhenotypes( + base_media=base_media, + base_uptake=data.get('base_uptake', 0), + base_excretion=data.get('base_excretion', 1000), + global_atom_limits=data.get('atom_limits', {}), + id=data.get('id'), + name=data.get('name'), + source=data.get('source'), + source_id=data.get('source_id'), + type=data.get('type') + ) + + # Reconstruct all phenotypes + new_phenos = [] + for pheno_data in data.get('phenotypes', []): + newpheno = MSGrowthPhenotype.from_dict(pheno_data, parent=growthpheno) + new_phenos.append(newpheno) + + growthpheno.add_phenotypes(new_phenos) + return growthpheno @staticmethod - def from_compound_hash(compounds, base_media, base_uptake=0, base_excretion=1000): - growthpheno = MSGrowthPhenotypes(base_media, base_uptake, base_excretion) + def from_compound_hash( + compounds, + base_media=None, + base_uptake=0, + base_excretion=1000, + global_atom_limits={}, + type="growth" + ): + growthpheno = MSGrowthPhenotypes( + base_media=base_media, base_uptake=base_uptake, base_excretion=base_excretion, global_atom_limits=global_atom_limits, id=None, name=None, source=None, source_id=None, type=None + ) new_phenos = [] for cpd in compounds: - newpheno = MSGrowthPhenotype(cpd, None, compounds[cpd], [], [cpd]) + newpheno = MSGrowthPhenotype(cpd,media=None,experimental_value=compounds[cpd],gene_ko=[],additional_compounds=[cpd],type=type) new_phenos.append(newpheno) growthpheno.add_phenotypes(new_phenos) return growthpheno @staticmethod - def from_kbase_object(data, kbase_api): - growthpheno = MSGrowthPhenotypes(None, 0, 1000) + def from_kbase_object( + data, + kbase_api, + base_media=None, + base_uptake=0, + base_excretion=1000, + global_atom_limits={}, + ): + growthpheno = MSGrowthPhenotypes( + base_media=base_media, base_uptake=base_uptake, base_excretion=base_excretion, global_atom_limits=global_atom_limits, id=data["id"], 
name=data["name"], source=data["source"], source_id=data["source_id"], type=data["type"] + ) new_phenos = [] for pheno in data["phenotypes"]: media = kbase_api.get_from_ws(pheno["media_ref"], None) @@ -148,17 +584,27 @@ def from_kbase_object(data, kbase_api): added_compounds = [] for added_cpd in pheno["additionalcompound_refs"]: added_compounds.append(added_cpd.split("/").pop()) + msmedia = MSMedia.from_kbase_object(media) newpheno = MSGrowthPhenotype( - pheno["id"], media, pheno["normalizedGrowth"], geneko, added_compounds + msmedia.id,name=msmedia.name, base_media=msmedia, experimental_value=pheno["normalizedGrowth"], knockouts=geneko, additional_compounds=added_compounds,parent=growthpheno ) new_phenos.append(newpheno) growthpheno.add_phenotypes(new_phenos) return growthpheno @staticmethod - def from_kbase_file(filename, kbase_api): + def from_kbase_file( + filename, + kbase_api, + base_media=None, + base_uptake=0, + base_excretion=1000, + global_atom_limits={}, + ): # TSV file with the following headers:media mediaws growth geneko addtlCpd - growthpheno = MSGrowthPhenotypes(base_media, 0, 1000) + growthpheno = MSGrowthPhenotypes( + base_media, base_uptake, base_excretion, global_atom_limits + ) headings = [] new_phenos = [] with open(filename) as f: @@ -177,21 +623,31 @@ def from_kbase_file(filename, kbase_api): {"mediaws": None, "geneko": [], "addtlCpd": []}, ) media = kbase_api.get_from_ws(data["media"], data["mediaws"]) + msmedia = MSMedia(media.id, name=media.name) + msmedia.mediacompounds = media.mediacompounds id = data["media"] if len(data["geneko"]) > 0: id += "-" + ",".join(data["geneko"]) if len(data["addtlCpd"]) > 0: id += "-" + ",".join(data["addtlCpd"]) newpheno = MSGrowthPhenotype( - id, media, data["growth"], data["geneko"], data["addtlCpd"] + id, msmedia, data["growth"], data["geneko"], data["addtlCpd"] ) new_phenos.append(newpheno) growthpheno.add_phenotypes(new_phenos) return growthpheno @staticmethod - def from_ms_file(filename, basemedia, base_uptake=0, base_excretion=100): - growthpheno = MSGrowthPhenotypes(base_media, base_uptake, base_excretion) + def from_ms_file( + filename, + base_media=None, + base_uptake=0, + base_excretion=100, + global_atom_limits={}, + ): + growthpheno = MSGrowthPhenotypes( + base_media, base_uptake, base_excretion, global_atom_limits + ) df = pd.read_csv(filename) required_headers = ["Compounds", "Growth"] for item in required_headers: @@ -211,6 +667,36 @@ def from_ms_file(filename, basemedia, base_uptake=0, base_excretion=100): growthpheno.add_phenotypes(new_phenos) return growthpheno + def to_kbase_json(self,genome_ref): + pheno_data = { + "id": self.id, + "name": self.name, + "source": self.source, + "source_id": self.source_id, + "type": self.type, + "phenotypes": [], + "genome_ref": genome_ref + } + for pheno in self.phenotypes: + pheno_data["phenotypes"].append({ + "id": pheno.id, + "name": pheno.name, + "media_ref": pheno.base_media.media_ref, + "normalizedGrowth": pheno.experimental_value, + "geneko_refs": pheno.knockouts, + "additionalcompound_refs": pheno.additional_compounds + }) + return pheno_data + + def build_super_media(self): + super_media = None + for pheno in self.phenotypes: + if not super_media: + super_media = pheno.build_media() + else: + super_media.merge(pheno.build_media(), overwrite_overlap=False) + return super_media + def add_phenotypes(self, new_phenotypes): keep_phenos = [] for pheno in new_phenotypes: @@ -220,76 +706,282 @@ def add_phenotypes(self, new_phenotypes): additions = DictList(keep_phenos) 
self.phenotypes += additions + def baseline_objective(self, model_or_mdlutl, objective): + """Simulates all the specified phenotype conditions and saves results + Parameters + ---------- + model_or_modelutl : Model | MSModelUtl + Model to use to run the simulations + """ + # Discerning input is model or mdlutl and setting internal links + modelutl = model_or_mdlutl + if not isinstance(model_or_mdlutl, MSModelUtil): + modelutl = MSModelUtil.get(model_or_mdlutl) + # Checking if base growth already computed + if modelutl in self.cached_based_growth: + if objective in self.cached_based_growth[modelutl]: + return self.cached_based_growth[modelutl][objective] + else: + self.cached_based_growth[modelutl] = {} + # Setting objective + modelutl.objective = objective + # Setting media + modelutl.pkgmgr.getpkg("KBaseMediaPkg").build_package( + self.base_media, self.base_uptake, self.base_excretion + ) + # Adding uptake limits + if len(self.atom_limits) > 0: + modelutl.pkgmgr.getpkg("ElementUptakePkg").build_package(self.atom_limits) + # Simulating + self.cached_based_growth[modelutl][objective] = modelutl.model.slim_optimize() + return self.cached_based_growth[modelutl][objective] + def simulate_phenotypes( self, - model, - biomass, + model_or_mdlutl, + multiplier=3, add_missing_exchanges=False, - correct_false_negatives=False, - template=None, - growth_threshold=0.001, save_fluxes=False, + save_reaction_list=False, + gapfill_negatives=False, + msgapfill=None, + test_conditions=None, + ignore_experimental_data=False, + flux_coefficients=None, + recall_phenotypes=True ): - model.objective = biomass - modelutl = MSModelUtil(model) + """Simulates all the specified phenotype conditions and saves results + Parameters + ---------- + model_or_mdlutl : Model | MSModelUtl + Model to use to run the simulations + multiplier : double + Indicates a multiplier to use for positive growth above the growth on baseline media + add_missing_exchanges : bool + Boolean indicating if exchanges for compounds mentioned explicitly in phenotype media should be added to the model automatically + save_fluxes : bool + Indicates if the fluxes should be saved and returned with the results + ignore_experimental_data : bool + Indicates if existing growth data in the phenotype set should be ignored when computing the class of a simulated phenotype + """ + # Discerning input is model or mdlutl and setting internal links + modelutl = model_or_mdlutl + if not isinstance(model_or_mdlutl, MSModelUtil): + modelutl = MSModelUtil.get(model_or_mdlutl) + # Establishing output of the simulation method summary = { - "Label": ["Accuracy", "CP", "CN", "FP", "FN"], - "Count": [0, 0, 0, 0, 0], + "Label": ["Accuracy", "CP", "CN", "FP", "FN", "P", "N"], + "Count": [0, 0, 0, 0, 0, 0, 0], } data = { "Phenotype": [], - "Observed growth": [], - "Simulated growth": [], + "Observed objective": [], + "Simulated objective": [], "Class": [], "Transports missing": [], "Gapfilled reactions": [], + "Gapfilling score": None, } + # Running simulations + gapfilling_solutions = {} + totalcount = 0 + datahash = {"summary": {}} for pheno in self.phenotypes: - with model: - result = pheno.simulate( - modelutl, growth_threshold, add_missing_exchanges, save_fluxes - ) # Result should have "growth" and "class" - if result["class"] == "FN" and correct_false_negatives: - pheno.gapfill_model_for_phenotype(modelutl, [template], None) - if pheno.gapfilling.last_solution != None: - list = [] - for rxn_id in pheno.gapfilling.last_solution["reversed"]: - list.append( - 
pheno.gapfilling.last_solution["reversed"][rxn_id] - + rxn_id - ) - for rxn_id in pheno.gapfilling.last_solution["new"]: - list.append( - pheno.gapfilling.last_solution["new"][rxn_id] + rxn_id - ) - data["Gapfilled reactions"].append(";".join(list)) - else: - data["Gapfilled reactions"].append(None) + result = pheno.simulate( + modelutl, + add_missing_exchanges=add_missing_exchanges + ) + datahash[pheno.id] = result + data["Class"].append(result["class"]) + data["Phenotype"].append(pheno.id) + data["Observed objective"].append(pheno.experimental_value) + data["Simulated objective"].append(result["objective_value"]) + data["Transports missing"].append(";".join(result["missing_transports"])) + if result["class"] == "CP": + summary["Count"][1] += 1 + summary["Count"][5] += 1 + summary["Count"][0] += 1 + totalcount += 1 + elif result["class"] == "CN": + summary["Count"][2] += 1 + summary["Count"][0] += 1 + summary["Count"][6] += 1 + totalcount += 1 + elif result["class"] == "FP": + summary["Count"][3] += 1 + summary["Count"][5] += 1 + totalcount += 1 + elif result["class"] == "FN": + summary["Count"][4] += 1 + summary["Count"][6] += 1 + totalcount += 1 + elif result["class"] == "P": + summary["Count"][5] += 1 + elif result["class"] == "N": + summary["Count"][6] += 1 + # Gapfilling negative growth conditions + if gapfill_negatives and result["class"] in ["N", "FN", "CN"]: + gapfilling_solutions[pheno] = pheno.gapfill_model_for_phenotype( + msgapfill, + test_conditions, + multiplier, + add_missing_exchanges, + ) + if gapfilling_solutions[pheno] != None: + data["Gapfilling score"] = 0 + list = [] + for rxn_id in gapfilling_solutions[pheno]["reversed"]: + list.append( + gapfilling_solutions[pheno]["reversed"][rxn_id] + rxn_id + ) + data["Gapfilling score"] += 0.5 + for rxn_id in gapfilling_solutions[pheno]["new"]: + list.append(gapfilling_solutions[pheno]["new"][rxn_id] + rxn_id) + data["Gapfilling score"] += 1 + data["Gapfilled reactions"].append(";".join(list)) else: data["Gapfilled reactions"].append(None) - result = pheno.simulate( - modelutl, growth_threshold, add_missing_exchanges, save_fluxes - ) # Result should have "growth" and "class" - data["Class"].append(result["class"]) - data["Phenotype"].append(pheno.id) - data["Observed growth"].append(pheno.growth) - data["Simulated growth"].append(result["growth"]) - data["Transports missing"].append( - ";".join(result["missing_transports"]) - ) - if result["class"] == "CP": - summary["Count"][1] += 1 - summary["Count"][0] += 1 - if result["class"] == "CN": - summary["Count"][2] += 1 - summary["Count"][0] += 1 - if result["class"] == "FP": - summary["Count"][3] += 1 - if result["class"] == "FN": - summary["Count"][4] += 1 - - summary["Count"][0] = summary["Count"][0] / len(self.phenotypes) + else: + data["Gapfilled reactions"].append(None) + if totalcount == 0: + summary["Count"][0] = None + else: + summary["Count"][0] = summary["Count"][0] / totalcount + datahash["summary"]["accuracy"] = summary["Count"][0] + datahash["summary"]["CP"] = summary["Count"][1] + datahash["summary"]["CN"] = summary["Count"][2] + datahash["summary"]["FP"] = summary["Count"][3] + datahash["summary"]["FN"] = summary["Count"][4] + datahash["summary"]["P"] = summary["Count"][5] + datahash["summary"]["N"] = summary["Count"][6] sdf = pd.DataFrame(summary) df = pd.DataFrame(data) - logger.info(df) - return {"details": df, "summary": sdf} + self.adjust_phenotype_calls(df) + return {"details": df, "summary": sdf,"data":datahash} + + def 
+    def adjust_phenotype_calls(self, data, baseline_objective=0.01):
+        """Reclassifies calls in the details table around a threshold midway between the lowest and highest simulated objective values"""
+        lowest = data["Simulated objective"].min()
+        if baseline_objective < lowest:
+            lowest = baseline_objective
+        highest = data["Simulated objective"].max()
+        threshold = (highest - lowest) / 2 + lowest
+        # When the spread of objective values is narrow, only values at the maximum are called as growth
+        if highest / (lowest + 0.000001) < 1.5:
+            threshold = highest
+        grow = 0
+        nogrow = 0
+        change = 0
+        for (i, item) in data.iterrows():
+            if item["Simulated objective"] >= threshold:
+                grow += 1
+                if item["Class"] == "N":
+                    data.loc[i, "Class"] = "P"
+                    change += 1
+                elif item["Class"] == "FN":
+                    data.loc[i, "Class"] = "CP"
+                    change += 1
+                elif item["Class"] == "CN":
+                    data.loc[i, "Class"] = "FP"
+                    change += 1
+            else:
+                nogrow += 1
+                if item["Class"] == "P":
+                    data.loc[i, "Class"] = "N"
+                    change += 1
+                elif item["Class"] == "CP":
+                    data.loc[i, "Class"] = "FN"
+                    change += 1
+                elif item["Class"] == "FP":
+                    data.loc[i, "Class"] = "CN"
+                    change += 1
+
+    def fit_model_to_phenotypes(
+        self,
+        msgapfill,
+        multiplier,
+        correct_false_positives=False,
+        minimize_new_false_positives=True,
+        atp_safe=True,
+        integrate_results=True,
+        global_gapfilling=True,
+    ):
+        """Fits the model to the specified phenotype conditions through gapfilling
+        Parameters
+        ----------
+        msgapfill : MSGapfill
+            Gapfilling object used for the gapfilling process
+        correct_false_positives : bool
+            Indicates if false positives should be corrected
+        minimize_new_false_positives : bool
+            Indicates if new false positives should be avoided
+        integrate_results : bool
+            Indicates if the resulting modifications to the model should be integrated
+        """
+        # Running simulations
+        # NOTE: best-guess repair of an unfinished draft that referenced undefined
+        # names (model, modelutl, gapfill_negatives); the model is assumed to come
+        # from the gapfilling object.
+        modelutl = MSModelUtil.get(msgapfill.gfmodel)
+        positive_growth = []
+        negative_growth = []
+        for pheno in self.phenotypes:
+            with modelutl.model:
+                result = pheno.simulate(modelutl, multiplier)
+                # Sorting media into negative and positive growth conditions
+                if result["class"] in ["N", "FN", "CN"]:
+                    negative_growth.append(pheno.build_media())
+                elif result["class"] in ["P", "FP", "CP"]:
+                    positive_growth.append(pheno.build_media())
+
+        # Create super media for all
+        super_media = self.build_super_media()
+        # Adding missing exchanges
+        msgapfill.gfmodel.add_missing_exchanges(super_media)
+        # Adding elemental constraints
+        self.add_elemental_constraints()
+        # Getting ATP tests
+
+        # Filtering database for ATP tests
+
+        # Penalizing database to avoid creating false positives
+
+        # Building additional tests from current correct negatives
+
+        # Computing base-line growth
+
+        # Computing growth threshold
+
+        # Running global gapfill
+
+        # Integrating solution
+
+    def gapfill_all_phenotypes(
+        self,
+        model_or_mdlutl,
+        msgapfill=None,  # Needed if the gapfilling object in the model utl is not initialized
+        threshold=None,
+        add_missing_exchanges=False,
+    ):
+        mdlutl = MSModelUtil.get(model_or_mdlutl)
+        # if msgapfill:
+        #     mdlutl.gfutl = msgapfill
+        # if not mdlutl.gfutl:
+        #     logger.critical(
+        #         "Must either provide a gapfilling object or provide a model utl with an existing gapfilling object"
+        #     )
+        # media_list = []
+        # for pheno in self.phenotypes:
+        #
+        #
+        # output = mdlutl.gfutl.run_multi_gapfill(
+        #     media_list,
+        #     default_minimum_objective=growth_threshold,
+        #     target=mdlutl.primary_biomass(),
+        #     binary_check=False,
+        #     prefilter=True,
+        #     check_for_growth=True,
+        # )
diff --git a/modelseedpy/core/msmedia.py b/modelseedpy/core/msmedia.py
index 488aad57..ae3053ab 100644
--- a/modelseedpy/core/msmedia.py
+++ b/modelseedpy/core/msmedia.py
@@ -1,16 +1,18 @@
 # -*- coding:
utf-8 -*- import logging from cobra.core.dictlist import DictList +from modelseedpy.core.msmodelutl import MSModelUtil logger = logging.getLogger(__name__) class MediaCompound: - def __init__(self, compound_id, lower_bound, upper_bound, concentration=None): + def __init__(self, compound_id, lower_bound, upper_bound, concentration=None, name=None): self.id = compound_id self.lower_bound = lower_bound self.upper_bound = upper_bound self.concentration = concentration + self.name = name @property def maxFlux(self): @@ -22,31 +24,134 @@ def minFlux(self): # TODO: will be removed later just for old methods return -self.upper_bound + def get_mdl_exchange_hash(self, model_or_mdlutl): + modelutl = model_or_mdlutl + if not isinstance(model_or_mdlutl, MSModelUtil): + modelutl = MSModelUtil.get(model_or_mdlutl) + mets = modelutl.find_met(self.id) + output = {} + exchange_hash = modelutl.exchange_hash() + for met in mets: + if met in exchange_hash: + output[met] = exchange_hash[met] + return output + class MSMedia: def __init__(self, media_id, name=""): self.id = media_id self.name = name + self.media_ref = None # Reference to the media object in the model self.mediacompounds = DictList() @staticmethod def from_dict(media_dict): """ - Either dict with exchange bounds (example: {'cpd00027': (-10, 1000)}) or - just absolute value of uptake (example: {''cpd00027': 10}) - :param media_dict: - :return: + Create MSMedia from a dictionary in various formats. + + Supported formats: + - Minimal: {'cpd00027': 10} - compound_id mapped to uptake rate (upper_bound) + - Bounds: {'cpd00027': (-10, 1000)} - compound_id mapped to (lower_bound, upper_bound) + - Complete: {'cpd00027': {'id': 'cpd00027', 'lower_bound': -10, 'upper_bound': 1000, + 'concentration': 5.0, 'name': 'Glucose'}} + + Parameters: + media_dict (dict): Dictionary in one of the supported formats + + Returns: + MSMedia: A new MSMedia instance """ media = MSMedia("media") media_compounds = [] + for cpd_id, v in media_dict.items(): - if isinstance(v, tuple): + if isinstance(v, dict): + # Complete format - dictionary with all fields + media_compounds.append(MediaCompound( + v.get('id', cpd_id), + v.get('lower_bound', -10), + v.get('upper_bound', 1000), + concentration=v.get('concentration'), + name=v.get('name') + )) + elif isinstance(v, tuple): + # Bounds format - tuple of (lower_bound, upper_bound) media_compounds.append(MediaCompound(cpd_id, v[0], v[1])) else: + # Minimal format - just the uptake value (upper_bound) media_compounds.append(MediaCompound(cpd_id, -v, 1000)) + media.mediacompounds += media_compounds return media + @staticmethod + def from_kbase_object(media_object): + """ + Create MSMedia from KBase media object. + :param media_object: KBase media object + :return: MSMedia instance + """ + media_id = media_object.id + media_ref = None + media_name = media_object.name if media_object.name else media_id + if media_object.info is not None: + media_id = media_object.info.id + media_name = media_object.info.id + media_ref = media_object.info.reference + output = MSMedia(media_id, name=media_name) + output.media_ref = media_ref + media_compounds = [] + for mediacpd in media_object.mediacompounds: + newmediacpd = MediaCompound(mediacpd.id, -1*mediacpd.maxFlux, -1*mediacpd.minFlux, concentration=mediacpd.concentration) + media_compounds.append(newmediacpd) + output.mediacompounds += media_compounds + return output + + def to_dict(self, output_type="minimal"): + """ + Convert MSMedia to a dictionary in various formats. 
+
+        Parameters:
+            output_type (str): Type of output format. Options are:
+                - "minimal": Dict of compound_id -> upper_bound (default)
+                - "bounds": Dict of compound_id -> (lower_bound, upper_bound)
+                - "complete": Dict of compound_id -> {all MediaCompound fields}
+
+        Returns:
+            dict: Dictionary representation of the media in the specified format
+
+        Examples:
+            Minimal: {'cpd00027': 1000}
+            Bounds: {'cpd00027': (-10, 1000)}
+            Complete: {'cpd00027': {'id': 'cpd00027', 'lower_bound': -10,
+                                    'upper_bound': 1000, 'concentration': 5.0,
+                                    'name': 'Glucose'}}
+        """
+        output = {}
+
+        if output_type == "minimal":
+            for compound in self.mediacompounds:
+                output[compound.id] = compound.upper_bound
+
+        elif output_type == "bounds":
+            for compound in self.mediacompounds:
+                output[compound.id] = (compound.lower_bound, compound.upper_bound)
+
+        elif output_type == "complete":
+            for compound in self.mediacompounds:
+                output[compound.id] = {
+                    'id': compound.id,
+                    'lower_bound': compound.lower_bound,
+                    'upper_bound': compound.upper_bound,
+                    'concentration': compound.concentration,
+                    'name': compound.name
+                }
+
+        else:
+            raise ValueError(f"Invalid output_type '{output_type}'. Must be 'minimal', 'bounds', or 'complete'.")
+
+        return output
+
     def get_media_constraints(self, cmp="e0"):
         """
         Parameters:
@@ -62,6 +167,60 @@ def get_media_constraints(self, cmp="e0"):
             media[met_id] = (compound.lower_bound, compound.upper_bound)
         return media
 
+    def find_mediacpd(self, cpd_id):
+        for cpd in self.mediacompounds:
+            if cpd.id == cpd_id:
+                return cpd
+        return None
+
+    def add_compound(self, compound_id, lower_bound, upper_bound, concentration=None, name=None):
+        """
+        Add a compound to the media.
+
+        Parameters:
+            compound_id (str): The ID of the compound to add
+            lower_bound (float): Lower bound for the compound flux
+            upper_bound (float): Upper bound for the compound flux
+            concentration (float, optional): Concentration of the compound. Defaults to None.
+            name (str, optional): Name of the compound. Defaults to None.
+
+        Returns:
+            MediaCompound: The newly created MediaCompound object
+        """
+        new_compound = MediaCompound(
+            compound_id,
+            lower_bound,
+            upper_bound,
+            concentration=concentration,
+            name=name
+        )
+        self.mediacompounds.append(new_compound)
+        return new_compound
+
+    def remove_compounds(self, compound_ids):
+        """
+        Remove compounds from the media by their IDs.
+
+        Parameters:
+            compound_ids (list): List of compound IDs to remove from the media
+
+        Returns:
+            list: List of removed MediaCompound objects
+        """
+        removed_compounds = []
+        compounds_to_keep = []
+
+        for compound in self.mediacompounds:
+            if compound.id in compound_ids:
+                removed_compounds.append(compound)
+            else:
+                compounds_to_keep.append(compound)
+
+        # Replace the mediacompounds list with only the compounds to keep
+        self.mediacompounds = DictList(compounds_to_keep)
+
+        return removed_compounds
+
     def merge(self, media, overwrite_overlap=False):
         new_cpds = []
         for cpd in media.mediacompounds:
@@ -74,3 +233,27 @@ def merge(self, media, overwrite_overlap=False):
             else:
                 new_cpds.append(newcpd)
         self.mediacompounds += new_cpds
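Given the three mirrored formats now accepted by `from_dict` and emitted by `to_dict`, a brief round-trip sketch may be useful; the compound IDs and bounds here are illustrative only:

```python
from modelseedpy.core.msmedia import MSMedia

# Build a media from the three supported input formats (illustrative values).
media = MSMedia.from_dict({
    "cpd00027": 10,                 # minimal: uptake rate only
    "cpd00001": (-1000, 1000),      # bounds: (lower_bound, upper_bound)
    "cpd00067": {"lower_bound": -5, "upper_bound": 100, "name": "H+"},
})

# Add and remove compounds after construction.
media.add_compound("cpd00009", -10, 1000, name="Phosphate")
media.remove_compounds(["cpd00001"])

# Export in any of the three formats. Note the asymmetry: the "minimal"
# *input* format is interpreted as an uptake rate, while the "minimal"
# *output* format reports upper bounds.
print(media.to_dict("bounds"))
print(media.to_dict("complete"))
```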
+
+    def copy(self):
+        """
+        Create a deep copy of the MSMedia object.
+
+        Returns:
+            MSMedia: A new MSMedia instance with copied media compounds
+        """
+        new_media = MSMedia(self.id, name=self.name)
+        new_media.media_ref = self.media_ref
+
+        # Copy all media compounds (including the optional name field)
+        copied_compounds = []
+        for compound in self.mediacompounds:
+            copied_compound = MediaCompound(
+                compound.id,
+                compound.lower_bound,
+                compound.upper_bound,
+                concentration=compound.concentration,
+                name=compound.name,
+            )
+            copied_compounds.append(copied_compound)
+
+        new_media.mediacompounds += copied_compounds
+        return new_media
diff --git a/modelseedpy/core/msminimalmedia.py b/modelseedpy/core/msminimalmedia.py
new file mode 100644
index 00000000..8c2f6e2d
--- /dev/null
+++ b/modelseedpy/core/msminimalmedia.py
@@ -0,0 +1,696 @@
+from modelseedpy.core.exceptions import ObjectiveError, FeasibilityError
+from modelseedpy.fbapkg.reactionusepkg import ReactionUsePkg
+from modelseedpy.core.fbahelper import FBAHelper
+from modelseedpy.core.msmodelutl import MSModelUtil
+from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg
+from itertools import combinations, permutations, chain
+from optlang import Variable, Constraint
+from cobra.medium import minimal_medium
+from optlang.symbolics import Zero
+from math import isclose, inf, factorial
+from deepdiff import DeepDiff  # required below by the jenga and syntrophy methods
+from time import process_time
+from pprint import pprint
+import logging
+import json, re
+
+logger = logging.getLogger(__name__)
+
+
+def _exchange_solution(sol_dict):
+    if isinstance(list(sol_dict.keys())[0], str):
+        return {
+            rxn: abs(flux)
+            for rxn, flux in sol_dict.items()
+            if "EX_" in rxn and flux < 0
+        }
+    elif hasattr(list(sol_dict.keys())[0], "id"):
+        return {
+            rxn.id: abs(flux)
+            for rxn, flux in sol_dict.items()
+            if "EX_" in rxn.id and flux < 0
+        }
+    return {
+        rxn.name: abs(flux)
+        for rxn, flux in sol_dict.items()
+        if "EX_" in rxn.name and flux < 0
+    }
+
+
+def _model_growth(sol_dict):
+    return sum(
+        [flux for var, flux in sol_dict.items() if re.search(r"(^bio\d+$)", var.name)]
+    )
+
+
+def _var_to_ID(var):
+    rxnID = var.name
+    if "_ru" in rxnID:
+        rxnID = rxnID.replace("_ru", "")
+    return rxnID
+
+
+def _compatibilize(org_models, printing=False):
+    from commscores import GEMCompatibility
+
+    return GEMCompatibility.standardize(
+        org_models,
+        conflicts_file_name="standardization_corrections.json",
+        printing=printing,
+    )
+
+
+def verify(org_model, min_media):
+    model2 = org_model.copy()
+    model2.medium = min_media
+    return model2.optimize()
+
+
+def bioFlux_check(model, sol=None, sol_dict=None, min_growth=0.1):
+    sol_dict = sol_dict or FBAHelper.solution_to_variables_dict(sol, model)
+    # print({k:v for k,v in sol_dict.items() if v > 1E-8})
+    simulated_growth = max(
+        sum(
+            [
+                flux
+                for var, flux in sol_dict.items()
+                if re.search(r"(^bio\d+$)", var.name)
+            ]
+        ),
+        sol.objective_value,
+    )
+    if simulated_growth < min_growth * 0.9999:
+        raise ObjectiveError(
+            f"The assigned minimal_growth of {min_growth} was not maintained during the simulation,"
+            f" where the observed growth value was {simulated_growth}."
+        )
+    if sol.status != "optimal":
+        # display(sol) in the draft only exists inside IPython sessions; logging keeps this importable
+        logger.warning(f"The solution status is {sol.status} rather than optimal.")
+    return sol_dict
+
+
+def minimizeFlux_withGrowth(model_util, min_growth, obj):
+    model_util.add_minimal_objective_cons(min_growth)
+    model_util.add_objective(obj, "min")
+    # print(model_util.model.objective)
+    # print([(cons.lb, cons.expression) for cons in model_util.model.constraints if "min" in cons.name])
+    sol = model_util.model.optimize()
+    # print(sol.objective_value)
+    sol_dict = bioFlux_check(model_util.model, sol, min_growth=min_growth)
+    return sol, sol_dict
+
+
+class MSMinimalMedia:
+
+    @staticmethod
+    def _influx_objective(model_util, interacting):
+        rxns = (
+            model_util.exchange_list() if interacting else model_util.transport_list()
+        )
+        influxes = []
+        for rxn in rxns:
+            if any(
+                ["e0" in met.id for met in rxn.reactants]
+            ):  # this is essentially every exchange
+                influxes.append(rxn.reverse_variable)
+            elif any(
+                ["e0" in met.id for met in rxn.products]
+            ):  # this captures edge cases or transporters
+                influxes.append(rxn.forward_variable)
+            else:
+                logger.critical(
+                    f"The reaction {rxn} lacks exchange metabolites, which indicates an error."
+                )
+        return influxes
+
+    @staticmethod
+    def minimize_flux(
+        org_model, min_growth=None, environment=None, interacting=True, printing=True
+    ):
+        """Minimize the total influx through exchange reactions in the model"""
+        if org_model.slim_optimize() == 0:
+            raise ObjectiveError(
+                f"The model {org_model.id} possesses an objective value of 0 in complete media, "
+                "which is incompatible with minimal media computations."
+            )
+        model_util = MSModelUtil(org_model, True)
+        model_util.add_medium(environment or model_util.model.medium)
+        # define the minimal growth for the MILP
+        min_growth = (
+            model_util.model.slim_optimize()
+            if min_growth is None
+            else min(min_growth, model_util.model.slim_optimize())
+        )
+        # min_flux = MSMinimalMedia._min_consumption_objective(model_util, interacting)
+        media_exchanges = MSMinimalMedia._influx_objective(model_util, interacting)
+        # parse the minimal media
+        sol, sol_dict = minimizeFlux_withGrowth(
+            model_util, min_growth, sum(media_exchanges)
+        )
+        min_media = _exchange_solution(sol_dict)
+        total_flux = sum([abs(flux) for flux in min_media.values()])
+        simulated_sol = verify(org_model, min_media)
+        if simulated_sol.status != "optimal":
+            raise FeasibilityError(
+                f"The simulation was not optimal, with a status of {simulated_sol.status}"
+            )
+        if printing:
+            print(
+                f"The minimal flux media for {org_model.id} consists of {len(min_media)} compounds and a {total_flux} total influx,"
+                f" with a growth value of {simulated_sol.objective_value}"
+            )
+        return min_media, sol
+
+    @staticmethod
+    def _min_consumption_objective(model_util, interacting):
+        rxns = (
+            model_util.exchange_list() if interacting else model_util.transport_list()
+        )
+        vars = {}
+        for rxn in rxns:
+            cons_name = rxn.id + "_bin"
+            if cons_name in model_util.model.constraints:
+                print(
+                    f"The {cons_name} constraint already exists in "
+                    f"{model_util.model.id} and thus is skipped.\n"
+                )
+                continue
+            # define the variable
+            var_name = rxn.id + "_ru"
+            if var_name in model_util.model.variables:
+                print(
+                    f"The {var_name} variable already exists in "
+                    f"{model_util.model.id} and thus is skipped.\n"
+                )
+                continue
+            vars[rxn.id] = Variable(var_name, lb=0, ub=1, type="binary")
+            model_util.add_cons_vars([vars[rxn.id]])
+            # bin_flux: {rxn_bin}*1000 >= {rxn_rev_flux}
+            model_util.create_constraint(
+                Constraint(Zero, lb=0, ub=None, name=cons_name),
+                coef={vars[rxn.id]: 1000, rxn.reverse_variable: -1},
+            )
+        return vars
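The `_bin` constraint built above is a standard big-M switch: `1000*y - v >= 0` forces the binary `y` (the `*_ru` variable) to 1 before any reverse (uptake) flux `v` can flow, so minimizing the sum of the binaries minimizes the number of consumed compounds. A self-contained toy sketch of the same coupling, using plain optlang with its default solver (variable names are illustrative, not tied to a real model):

```python
from optlang import Model, Variable, Constraint, Objective

# Toy illustration of the 1000*y - v >= 0 coupling used above:
# uptake flux v can only be nonzero when the binary y is switched on.
v = Variable("EX_glc_rev", lb=0, ub=10)           # reverse (uptake) flux
y = Variable("EX_glc_ru", lb=0, ub=1, type="binary")

coupling = Constraint(1000 * y - v, lb=0)         # equivalent to v <= 1000 * y
demand = Constraint(v, lb=5)                      # force at least 5 units of uptake

m = Model()
m.add([coupling, demand])
m.objective = Objective(y, direction="min")       # minimize the number of active uptakes
m.optimize()
print(y.primal)  # 1.0 — the binary must activate to permit the required flux
```

The big-M value of 1000 matches the conventional default flux bound, so the constraint never clips a feasible uptake.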
+
+    @staticmethod
+    def conserved_exchanges():
+        pass
+
+    @staticmethod
+    def relative_media():
+        pass
+
+    @staticmethod
+    def minimize_components(
+        org_model,
+        min_growth=0.1,
+        environment=None,
+        interacting=True,
+        solution_limit=5,
+        printing=True,
+    ):
+        """Minimize the number of metabolites that are consumed by the model"""
+        if org_model.slim_optimize() == 0:
+            raise ObjectiveError(
+                f"The model {org_model.id} possesses an objective value of 0 in complete media, "
+                "which is incompatible with minimal media computations."
+            )
+        model_util = MSModelUtil(org_model, True)
+        model_util.add_timeout(10)
+        print("Minimal Components media")
+        if environment:
+            model_util.add_medium(environment)
+        # ic(org_model, min_growth, solution_limit)
+        model_util.add_minimal_objective_cons(
+            min_growth
+        )  # , model_util.model.reactions.bio1.flux_expression)
+        # print(model_util.model.constraints[-1])
+        # define the binary variables and constraints
+        time1 = process_time()
+        variables = {
+            "ru": MSMinimalMedia._min_consumption_objective(model_util, interacting)
+        }
+        model_util.add_objective(sum(variables["ru"].values()), "min")
+        time2 = process_time()
+        print(f"\nDefinition of minimum objective time: {(time2 - time1)/60} mins")
+
+        # determine each solution
+        # interdependencies = {}
+        solution_dicts, min_media = [], [0] * 1000
+        sol = (
+            model_util.model.optimize()
+        )  # TODO This is the troublesome line that occasionally refuses to solve
+        if "optimal" not in sol.status:
+            raise FeasibilityError(
+                f"The simulation for minimal uptake in {model_util.model.id} was {sol.status}."
+            )
+        time3 = process_time()
+        broken = False
+        while (
+            not broken
+            and sol.status == "optimal"
+            and len(solution_dicts) < solution_limit
+        ):
+            print(f"Iteration {len(solution_dicts)}", end="\r")
+            sol_dict = FBAHelper.solution_to_variables_dict(sol, model_util.model)
+            ## ensure that the minimal growth is respected
+            simulated_growth = _model_growth(sol_dict)
+            if simulated_growth < min_growth * 0.9999:
+                raise ObjectiveError(
+                    f"The minimal growth of {min_growth} was not maintained; "
+                    f"the simulation achieved {simulated_growth} growth."
+                )
+            sol_rxns_dict = FBAHelper.solution_to_rxns_dict(sol, model_util.model)
+            solution_dicts.append(sol_dict)
+            sol_media = _exchange_solution(sol_rxns_dict)
+            min_media = sol_media if len(sol_media) < len(min_media) else min_media
+            ## omit the discovered solution from future searches
+            model_util.create_constraint(
+                Constraint(
+                    Zero,
+                    lb=None,
+                    ub=len(sol_dict) - 1,
+                    name=f"exclude_sol{len(solution_dicts)}",
+                ),
+                sol_dict,
+            )
+
+            # search the permutation space by omitting previously investigated solution_dicts
+            # sol_exchanges = [rxn for rxn in sol_dict if "EX_" in rxn.name]
+            # interdependencies[count] = MSMinimalMedia._examine_permutations(
+            #     model, sol_exchanges, variables, sol_dict, count, interacting)
+            try:
+                sol = model_util.model.optimize()
+            except Exception:
+                broken = True
+            if broken:
+                break
+        if not solution_dicts:
+            raise FeasibilityError("The model was not feasibly simulated.")
+        min_media = {rxn: flux for rxn, flux in min_media.items()}
+        simulated_sol = verify(org_model, min_media)
+        if simulated_sol.status != "optimal":
+            raise FeasibilityError(
+                f"The predicted media {min_media} is not compatible with its model {org_model.id}, "
+                f"and possesses a(n) {simulated_sol.status} status."
+            )
+        time6 = process_time()
+        print(f"Optimization time: {(time6-time3)/60} mins")
+        return min_media
+
+    @staticmethod
+    def _knockout(org_model, rxnVar, variables, sol_dict, sol_index, interacting):
+        # knockout the specified exchange
+        knocked_model_utl = MSModelUtil(org_model, True)
+        # NOTE: the draft unpacked two values here, but _min_consumption_objective
+        # only returns the variable dictionary
+        vars = MSMinimalMedia._min_consumption_objective(
+            knocked_model_utl, interacting
+        )
+        coef = {rxnVar: 0}
+        if interacting:
+            coef.update(
+                {
+                    variables["ru"][_var_to_ID(rxnVar2)]: 1
+                    for rxnVar2 in sol_dict
+                    if rxnVar != rxnVar2 and "EX_" in rxnVar2.name
+                }
+            )
+        else:
+            coef.update(
+                {
+                    variables["ru"][_var_to_ID(rxnVar2)]: 1
+                    for rxnVar2 in sol_dict
+                    if (
+                        rxnVar != rxnVar2
+                        and any(["_e0" in met.id for met in rxnVar2.metabolites])
+                    )
+                }
+            )
+        knocked_model_utl.create_constraint(
+            Constraint(Zero, lb=0.1, ub=None, name=f"{rxnVar.name}-sol{sol_index}"),
+            coef,
+        )
+        return knocked_model_utl.optimize()
+
+    @staticmethod
+    def _examine_permutations(
+        model, exchange_ids_to_explore, variables, sol_dict, sol_index, interacting
+    ):
+        interdependencies = {}
+        for index, ex in enumerate(exchange_ids_to_explore):
+            print(
+                f"{ex.name}: {index}/{len(exchange_ids_to_explore)-1} exchanges to explore"
+            )
+            sol_dict_sans_ex = sol_dict.copy()
+            sol_dict_sans_ex.pop(ex)
+            # interdependencies[sol_index][exID] = MSMinimalMedia._examine_permutations(
+            #     exID, sol_dict, sol_index, variables, sol_dict_sans_ex)
+
+            ## explore permutations after removing the selected variable
+            diff = DeepDiff(
+                sol_dict_sans_ex,
+                FBAHelper.solution_to_dict(
+                    MSMinimalMedia._knockout(
+                        model, ex, variables, sol_dict, sol_index, interacting
+                    )
+                ),
+            )
+            if (
+                diff
+            ):  # the addition of new exchanges or altered exchange fluxes are detected after the removed exchange
+                print(diff)
+                for key, changes in diff.items():
+                    # for change in changes:
+                    #     print(change)
+                    changed_reactions = [
+                        re.search(r"(?<=\[')(.+)(?='\])", change).group()
+                        for change in changes
+                    ]
+                    # this dictionary should be parsed into a list of substitute metabolites and a list of functionally coupled reactions
+                    for exchange in [rxn for rxn in changed_reactions if "EX_" in rxn]:
+                        # NOTE: setdefault avoids the KeyError that the draft's direct
+                        # indexing of an empty dictionary would raise
+                        interdependencies.setdefault(sol_index, {})[exchange] = (
+                            MSMinimalMedia._examine_permutations(
+                                model,
+                                exchange_ids_to_explore,
+                                variables,
+                                sol_dict,
+                                sol_index + 1,
+                                interacting,
+                            )
+                        )
+            # coef = {variables["met"][exID]: 0 for cpd in new_mets.keys()}
+            # coef.update({variables["met"][exID]: 1 for exID in sol_dict if exID not in new_mets.keys()})
+            # cpd_name = "_".join(new_mets.keys())
+            new_sol = model.optimize()
+            new_sol_dict = FBAHelper.solution_to_variables_dict(new_sol, model)
+            new_sol_exchanges = [rxn for rxn in sol_dict if "EX_" in rxn.name]
+            if new_sol.status != "optimal":
+                return interdependencies
+            MSMinimalMedia._examine_permutations(
+                model,
+                new_sol_exchanges,
+                variables,
+                new_sol_dict,
+                sol_index + 1,
+                interacting,
+            )
+        return interdependencies
+
+    @staticmethod
+    def determine_min_media(
+        model,
+        minimization_method="minFlux",
+        min_growth=None,
+        environment=None,
+        interacting=True,
+        solution_limit=5,
+        printing=True,
+    ):
+        if minimization_method == "minFlux":
+            return MSMinimalMedia.minimize_flux(
+                model, min_growth, environment, interacting, printing
+            )
+        if minimization_method == "minComponents":
+            return minimal_medium(model, min_growth, minimize_components=True)
+            # return MSMinimalMedia.minimize_components(
+            #     model, min_growth, environment, interacting, solution_limit, printing)
+        if minimization_method == "jenga":
"jenga": + return MSMinimalMedia.jenga_method(model, printing=printing) + + @staticmethod + def comm_media_est( + models, + comm_model, + minimization_method="minComponents", + min_growth=0.1, + environment=None, + interacting=True, + n_solutions=5, + printing=False, + ): + media = {"community_media": {}, "members": {}} + # print("com_media_est") + for org_model in models: + model_util = MSModelUtil(org_model, True) + # print(model_util.model.optimize()) + if environment: + # print(environment) + model_util.add_medium(environment) + # reactions = [rxn.name for rxn in model.variables] + # duplicate_reactions = DeepDiff(sorted(set(reactions)), sorted(reactions)) + # if duplicate_reactions: + # logger.critical(f'CodeError: The model {model.id} contains {duplicate_reactions}' + # f' that compromise the model.') + media["members"][model_util.model.id] = { + "media": MSMinimalMedia.determine_min_media( + model_util.model, + minimization_method, + min_growth, + environment, + interacting, + n_solutions, + printing, + ), + "solution": FBAHelper.solution_to_dict(model_util.model.optimize()), + } + if minimization_method == "jenga": + media["community_media"] = FBAHelper.sum_dict( + media["members"][model_util.model.id]["media"], + media["community_media"], + ) + if comm_model: + comm_util = MSModelUtil(comm_model) + if environment: + comm_util.add_medium(environment) + # if minimization_method == "jenga": + # print("Community models are too excessive for direct assessment via the JENGA method; " + # "thus, the community minimal media is estimated as the combination of member media.") + # return media + media["community_media"] = MSMinimalMedia.determine_min_media( + comm_model, + minimization_method, + min_growth, + environment, + interacting, + n_solutions, + printing, + ) + return media + + @staticmethod + def interacting_comm_media( + models, + comm_model, + minimization_method="jenga", + min_growth=0.1, + media=None, + environment=None, + printing=True, + ): + # define the community minimal media + media = media or MSMinimalMedia.comm_media_est( + models, + comm_model, + min_growth, + minimization_method, + environment, + printing=printing, + ) + org_media = media["community_media"].copy() + original_time = process_time() + # remove exchanges that can be satisfied by cross-feeding + for model in models: + for rxnID, flux in media["members"][model.id]["solution"].items(): + if ( + rxnID in media["community_media"] and flux > 0 + ): ## outflux in solutions + stoich = list( + model.reactions.get_by_id(rxnID).metabolites.values() + )[0] + media["community_media"][rxnID] += ( + flux * stoich + ) ## the cytoplasmic removal is captured by negative reactant stoich + media["community_media"] = { + ID: flux for ID, flux in media["community_media"].items() if flux > 0 + } # influx in media + syntrophic_diff = DeepDiff(org_media, media["community_media"]) + changed_quantity = ( + 0 + if not syntrophic_diff + else len(list(chain(*[v for v in list(dict(syntrophic_diff).values())]))) + ) + if printing: + print( + f"Syntrophic fluxes examined after {(process_time() - original_time) / 60} minutes, " + f"with {changed_quantity} change(s): {syntrophic_diff}" + ) + return media + + @staticmethod + def jenga_method( + org_model, + org_media=None, + conserved_cpds: list = None, + export=True, + printing=True, + compatibilize=False, + environment=None, + ): + # copy and compatibilize the parameter objects + if org_model.slim_optimize() == 0: + raise ObjectiveError( + f"The model {org_model.id} possesses an objective 
+                "which is incompatible with minimal media computations."
+            )
+        copied_model = org_model.copy()
+        copied_model.medium = environment or copied_model.medium
+        if compatibilize:
+            copied_model = _compatibilize(copied_model)
+        original_media = org_media or MSMinimalMedia.minimize_components(copied_model)
+
+        # identify removable compounds
+        original_time = process_time()
+        copied_model.medium = original_media
+        original_obj_value = org_model.optimize().objective_value
+        redundant_cpds = set()
+        for cpd in original_media:
+            new_media = original_media.copy()
+            new_media.pop(cpd)
+            copied_model.medium = new_media
+            sol_obj_val = copied_model.slim_optimize()
+            if isclose(sol_obj_val, original_obj_value, abs_tol=1e-4):
+                redundant_cpds.add(cpd)
+            else:
+                logger.debug(
+                    f"The {sol_obj_val} objective value after the removal of {cpd} "
+                    f"does not match the original objective value of {original_obj_value}."
+                )
+        if not redundant_cpds:
+            logger.debug(
+                "None of the media components were determined to be removable."
+            )
+            return original_media
+        if len(redundant_cpds) > 9:
+            import sigfig
+
+            num_permuts = sigfig.round(
+                factorial(len(redundant_cpds)), sigfigs=2, format="sci"
+            )
+            raise FeasibilityError(
+                f"The model {copied_model.id} contains {len(redundant_cpds)} removable"
+                f" compounds, which yields {num_permuts} permutations and is untenable for computation."
+                " Select a different minimal media method such as 'minFlux' or 'minComponents'."
+            )
+
+        # vet all permutation removals of the redundant compounds
+        permuts = [p for p in permutations(redundant_cpds)]
+        if printing:
+            print(
+                f"The {len(permuts)} permutations of the {redundant_cpds} redundant compounds, "
+                "from absolute tolerance of 1e-4, will be examined."
+            )
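The permutation vetting below keeps the factorial search tractable by recording failed removal prefixes and skipping every permutation that shares one. A toy, library-independent sketch of that pruning idea (the `fails` set stands in for simulations that drop below the original objective):

```python
from itertools import permutations

# Toy sketch of the prefix pruning used below: once a removal order fails at
# some prefix, every permutation sharing that prefix is skipped outright.
def prunable_search(items, fails):
    """fails: set of removal prefixes known to break the model (toy stand-in)."""
    failed_prefixes, survivors = [], []
    for permut in permutations(items):
        prefixes = [permut[:i] for i in range(len(permut), 0, -1)]
        if any(p in failed_prefixes for p in prefixes):
            continue  # pruned without re-simulation
        for i in range(1, len(permut) + 1):
            if permut[:i] in fails:
                failed_prefixes.append(permut[:i])
                break
        else:
            survivors.append(permut)
    return survivors

print(prunable_search(("a", "b", "c"), fails={("b",)}))
# only orderings that never hit the failing prefix survive
```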
+        permut_results, failed_permut_starts = [], []
+        best = 0
+        for perm_index, permut in enumerate(permuts):
+            print(f"{perm_index+1}/{len(permuts)}", end="\r")
+            successful_removal = 0
+            permut_segments = [permut[:index] for index in range(len(permut), 2, -1)]
+            ## eliminate previously discovered failures and successes, respectively
+            if any([seg in failed_permut_starts for seg in permut_segments]):
+                continue
+            if best >= len(permut) / 2 and any(
+                [
+                    set(permut[: best - 1]) == set(list(success)[: best - 1])
+                    for success in permut_results
+                ]
+            ):
+                continue
+            new_media = original_media.copy()
+            for cpd in permut:
+                ### parameterize and simulate the community
+                new_media.pop(cpd)
+                copied_model.medium = new_media
+                sol = copied_model.optimize()
+                if not isclose(sol.objective_value, original_obj_value, abs_tol=1e-7):
+                    failed_permut_starts.append(permut[: successful_removal + 1])
+                    break
+                successful_removal += 1
+
+            if successful_removal >= best:
+                if successful_removal > best:
+                    best = successful_removal
+                    permut_results = []
+                permut_removable = permut[
+                    :best
+                ]  # slice only the elements that are removable
+                if permut_removable not in permut_results:
+                    permut_results.append(permut_removable)
+                    if printing:
+                        print(permut_removable)
+                        print("best:", best)
+
+        # filter to only the most minimal media
+        unique_combinations, unique_paths = [], []
+        for removal_path in permut_results:
+            path_permutations = permutations(removal_path)
+            if all([path in permut_results for path in path_permutations]):
+                for com in combinations(removal_path, len(removal_path)):
+                    com = set(com)
+                    if com not in unique_combinations:
+                        unique_combinations.append(com)
+            else:
+                unique_paths.append(removal_path)
+        if unique_combinations and printing:
+            print("Unique combinations:")
+            print(len(unique_combinations), unique_combinations)
+        if unique_paths and printing:
+            print("Unique paths:")
+            print(len(unique_paths), unique_paths)
+
+        # further remove compounds from the media, while defaulting to the removal with the largest ID values
+        best_removals = {}
+        possible_removals = unique_combinations + unique_paths
+        if conserved_cpds:
+            possible_removals = [
+                opt
+                for opt in possible_removals
+                if not any(cpd in conserved_cpds for cpd in opt)
+            ]
+        best = -inf
+        for removal in possible_removals:
+            cpdID_sum = sum(
+                [
+                    int(cpd.split("_")[1].replace("cpd", "") if "cpd" in cpd else 500)
+                    for cpd in removal
+                ]
+            )
+            if cpdID_sum > best:
+                best = cpdID_sum
+                best_removals = {best: [removal]}
+            elif cpdID_sum == best:
+                best_removals[best].append(removal)
+        ## arbitrarily select the first removal from those that both maximize the summed cpdID and avoid conserved compounds
+        media = FBAHelper.remove_media_compounds(
+            original_media, list(best_removals.values())[0][0], printing
+        )
+        if printing:
+            print(best_removals)
+            pprint(media)
+
+        # communicate results
+        jenga_media = media.copy()
+        jenga_difference = DeepDiff(original_media, jenga_media)
+        changed_quantity = (
+            0 if not jenga_difference else len(list(jenga_difference.values())[0])
+        )
+        if printing:
+            print(
+                f"Jenga fluxes examined after {(process_time()-original_time)/60} minutes, "
+                f"with {changed_quantity} change(s): {jenga_difference}"
+            )
+        if export:
+            export_name = copied_model.id + "_media.json"
+            with open(export_name, "w") as out:
+                json.dump(media, out, indent=3)
+        return media
diff --git a/modelseedpy/core/msmodel.py b/modelseedpy/core/msmodel.py
old mode 100644
new mode 100755
index baaa0315..48c9f985 100644
--- a/modelseedpy/core/msmodel.py
+++ b/modelseedpy/core/msmodel.py
@@ -1,10 +1,12 @@
 # -*- coding: utf-8 -*-
 import logging
 import re
-from cobra.core import Model
-from pyeda.inter import (
-    expr,
-)  # wheels must be specially downloaded and installed for Windows https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyeda
+from sympy.logic.inference import satisfiable
+from sympy import Symbol
+import sympy.logic.boolalg as spl
+from cobra.core import Model, GPR
+
+# from pyeda.inter import expr
 
 logger = logging.getLogger(__name__)
 
@@ -103,44 +105,63 @@ def get_cmp_token(compartments):
     return None
 
 
-def get_set_set(expr_str):  # !!! this currently returns dictionaries, not sets??
+def get_set_set_pyeda(expr_str: str, pyeda_expr):
     if len(expr_str.strip()) == 0:
         return {}
     expr_str = expr_str.replace(" or ", " | ")
     expr_str = expr_str.replace(" and ", " & ")
-    dnf = expr(expr_str).to_dnf()
+    dnf = pyeda_expr(expr_str).to_dnf()
     if len(dnf.inputs) == 1 or dnf.NAME == "And":
         return {frozenset({str(x) for x in dnf.inputs})}
     else:
         return {frozenset({str(x) for x in o.inputs}) for o in dnf.xs}
 
 
+def get_set_set(expr_str: str):
+    """Converts a GPR string into a frozenset of frozensets of gene IDs, one inner set per AND-clause of the DNF"""
+    if expr_str is None or len(expr_str.strip()) == 0:
+        return {}
+    gpr = GPR.from_string(expr_str)
+    expr = gpr.as_symbolic()
+    expr_model = list(satisfiable(expr, all_models=True))
+    dnf = spl.SOPform(tuple(gpr.genes), list(expr_model))
+    if type(dnf) == spl.And or type(dnf) == Symbol:
+        variable_set = set()
+        variable_set.add(frozenset({atom.name for atom in dnf.atoms()}))
+        return frozenset(variable_set)
+    elif type(dnf) == spl.Or:
+        return frozenset(
+            {frozenset({atom.name for atom in x.atoms()}) for x in dnf.args}
+        )
+    else:
+        raise ValueError(f"unable to decode {expr_str}; found token of type {type(dnf)}")
+
+
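Since the sympy rewrite recovers a sum-of-products form, each inner frozenset corresponds to one AND-clause, i.e., one protein complex or isozyme. A small worked example of the intended mapping (gene IDs are illustrative; the expected outputs follow from the code above):

```python
from modelseedpy.core.msmodel import get_set_set

# One complex of two genes: a single inner set.
print(get_set_set("b0001 and b0002"))
# expected: frozenset({frozenset({'b0001', 'b0002'})})

# Two alternatives (a two-gene complex OR an isozyme): two inner sets.
print(get_set_set("(b0001 and b0002) or b0003"))
# expected: frozenset({frozenset({'b0001', 'b0002'}), frozenset({'b0003'})})
```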
""" - super().__init__(self, id_or_model) + super().__init__(id_or_model) if genome: - self.genome_object = genome + self._genome = genome if template: - self.template_object = template + self._template = template @property def template(self): - return self.template_object + return self._template @template.setter def template(self, template): - self.template_object = template + self._template = template @property def genome(self): - return self.genome_object + return self._genome @genome.setter def genome(self, genome): - self.genome_object = genome + self._genome = genome def _set_genome_to_model(self, genome): # TODO: implement genome assignment checks if features matches genes diff --git a/modelseedpy/core/msmodelreport.py b/modelseedpy/core/msmodelreport.py new file mode 100644 index 00000000..2d980e38 --- /dev/null +++ b/modelseedpy/core/msmodelreport.py @@ -0,0 +1,636 @@ +# -*- coding: utf-8 -*- +import pandas as pd +import logging +import os +import re +import jinja2 +from os.path import dirname +from pandas.io.formats.style import Styler +from modelseedpy.core.msmodelutl import MSModelUtil + +module_path = dirname(os.path.abspath(__file__)) + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + + +class MSModelReport: + def __init__(self, model_or_mdlutl): + if isinstance(model_or_mdlutl, MSModelUtil): + self.model = model_or_mdlutl.model + self.modelutl = model_or_mdlutl + else: + self.model = model_or_mdlutl + self.modelutl = MSModelUtil.get(model_or_mdlutl) + + def generate_reports(self, report_path, multi_tab_report_path): + self.build_report(report_path) + self.build_multitab_report(multi_tab_report_path) + + # Helper function to build overview data + def build_overview_data(self): + # Get the number of compartments + number_compartments = len( + set([metabolite.compartment for metabolite in self.model.metabolites]) + ) + + # Extract gapfilling information + core_gapfilling_media = [] + gapfilling_media = [] + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + if gf_sensitivity: + for media in gf_sensitivity: + if ( + "bio1" in self.modelutl.attributes["gf_sensitivity"][media] + and "success" + in self.modelutl.attributes["gf_sensitivity"][media]["bio1"] + ): + gapfilling_media.append(media) + if ( + "rxn00062_c0" in self.modelutl.attributes["gf_sensitivity"][media] + and "success" + in self.modelutl.attributes["gf_sensitivity"][media]["rxn00062_c0"] + ): + core_gapfilling_media.append(media) + + # Count the number of gapfills + number_gapfills = len(gapfilling_media) + + # Convert the lists to strings + core_gapfilling_str = ( + "; ".join(core_gapfilling_media) + if core_gapfilling_media + else "No core gapfilling needed." + ) + gapfilling_media_str = ( + "; ".join(gapfilling_media) + if gapfilling_media + else "No genome-scale gapfilling." 
+ ) + + overview = { + "Model ID": self.model.id, + "Full Gapfilling and ATP Analysis Report": "TBD", # You may replace 'TBD' with actual data when available + "Genome Scale Template": self.model.notes.get( + "kbase_template_refs", "Data Not Available" + ), + "Core Gapfilling Media": core_gapfilling_str, + "Gapfilling Media": gapfilling_media_str, + "Source Genome": self.model.notes.get( + "kbase_genome_ref", "Data Not Available" + ), + "Total Number of reactions": self.modelutl.nonexchange_reaction_count(), + "Number compounds": len(self.model.metabolites), + "Number compartments": number_compartments, + "Number biomass": len( + [ + rxn + for rxn in self.model.reactions + if rxn.annotation.get("sbo") == "SBO:0000629" + ] + ), + "Number gapfills": number_gapfills, + } + return overview + + # Helper function for extracting gapfilling data + def extract_gapfilling_data(self, gf_sensitivity): + if gf_sensitivity is None: + return [], {} + + gapfilling_dict = {} + gapfilling_summary = {} + + for media, media_data in gf_sensitivity.items(): + for target, target_data in media_data.items(): + gf_data = target_data.get("success", {}) + if isinstance(gf_data, dict): + for reaction_id, reaction_data in gf_data.items(): + for direction, metabolites in reaction_data.items(): + # If metabolites is None, set to empty string + if metabolites is None: + metabolites = "" + + # Extract both IDs and Names for Gapfilling Sensitivity + sensitivity_ids = [] + sensitivity_names = [] + if isinstance(metabolites, (list, tuple)): + for met_id in metabolites: + sensitivity_ids.append(met_id) + met_name = ( + self.model.metabolites.get_by_id(met_id).name + if met_id in self.model.metabolites + else met_id + ) + sensitivity_names.append(met_name) + else: + metabolites = str(metabolites) + entry = { + "reaction_id": reaction_id, + "reaction_name": self.model.reactions.get_by_id( + reaction_id + ).name + if reaction_id in self.model.reactions + else reaction_id, + "media": media, + "direction": direction, + "target": target, + "gapfilling_sensitivity_id": "; ".join(sensitivity_ids) + if sensitivity_ids + else metabolites, + "gapfilling_sensitivity_name": "; ".join( + sensitivity_names + ) + if sensitivity_names + else metabolites, + } + + # Update the summary dictionary + if reaction_id not in gapfilling_summary: + gapfilling_summary[reaction_id] = [] + gapfilling_summary[reaction_id].append( + f"{media}: {direction}" + ) + + # Check if reaction_id is already in dictionary + if reaction_id in gapfilling_dict: + # Update the media + existing_entry = gapfilling_dict[reaction_id] + existing_media = existing_entry["media"].split("; ") + if media not in existing_media: + existing_media.append(media) + existing_entry["media"] = "; ".join(existing_media) + else: + gapfilling_dict[reaction_id] = entry + + return list(gapfilling_dict.values()), gapfilling_summary + + # transform data to be used in tabular format to use in build_model_report + def transform_gapfilling_data(self, gapfilling_data): + transformed_data = [] + for entry in gapfilling_data: + row = [ + entry["reaction_id"], + entry["reaction_name"], + entry["media"], + entry["direction"], + entry["target"], + entry["gapfilling_sensitivity_id"], + entry["gapfilling_sensitivity_name"], + ] + transformed_data.append(row) + return transformed_data + + # Extract ATP analysis data + def extract_atp_analysis_data(self, atp_analysis, atp_expansion_filter): + entries = [] + if atp_analysis and "core_atp_gapfilling" in atp_analysis: + for media, data in 
atp_analysis["core_atp_gapfilling"].items(): + score = data.get("score", None) + new_reactions = [ + "{}: {}".format(k, v) for k, v in data.get("new", {}).items() + ] + reversed_reactions = [ + "{}: {}".format(k, v) for k, v in data.get("reversed", {}).items() + ] + atp_production = "Not integrated" + if ( + "selected_media" in atp_analysis + and media in atp_analysis["selected_media"] + ): + atp_production = atp_analysis["selected_media"][media] + + # Extracting the "Filtered Reactions" in the required format + filtered_reactions = [] + for k, v in atp_expansion_filter.get(media, {}).items(): + if isinstance(v, dict): + for sub_k, sub_v in v.items(): + if isinstance(sub_v, dict): + for reaction, direction_dict in sub_v.items(): + direction = list(direction_dict.keys())[0] + filtered_reactions.append( + f"{reaction}: {direction}" + ) + filtered_reactions_str = "; ".join(filtered_reactions) + + if score is not None: + entries.append( + { + "media": media, + "no_of_gapfilled_reactions": score, + "atp_production": atp_production, + "gapfilled_reactions": "; ".join(new_reactions), + "reversed_reaction_by_gapfilling": "; ".join( + reversed_reactions + ), + "filtered_reactions": filtered_reactions_str, + } + ) + # Sorting the entries based on the 'no_of_gapfilled_reactions' column + entries.sort(key=lambda x: x["no_of_gapfilled_reactions"]) + return entries + + # Extract ATP production data for the ATP Analysis tab + def extract_atp_production_data(self, atp_analysis): + atp_production_dict = {} + if atp_analysis: + selected_media = atp_analysis.get("selected_media", {}) + core_atp_gapfilling = atp_analysis.get("core_atp_gapfilling", {}) + + # First, process selected_media + for media, value in selected_media.items(): + atp_production_dict[media] = round(value, 2) + + # Next, process core_atp_gapfilling for media not in selected_media + for media, data in core_atp_gapfilling.items(): + if media not in atp_production_dict: + if data.get("failed"): + atp_production_dict[media] = "failed" + else: + # If the media was not processed in selected_media and it's not failed, set as 'Not Integrated' + atp_production_dict[media] = "Not Integrated" + + return atp_production_dict + + def build_multitab_report(self, output_path): + + # Build overview data + overview_data = self.build_overview_data() + + # Get gf_sensitivity attribute from the model + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + + # Extract gapfilling data + gapfilling_entries, gapfilling_reaction_summary = self.extract_gapfilling_data( + gf_sensitivity + ) + + # Check if ATP_analysis attribute is present in the model + atp_analysis = self.modelutl.attributes.get("ATP_analysis", None) + if atp_analysis: + atp_expansion_filter = self.modelutl.attributes.get( + "atp_expansion_filter", {} + ) + atp_analysis_entries = self.extract_atp_analysis_data( + atp_analysis, atp_expansion_filter + ) + else: + atp_analysis_entries = [] + + # Initialize context dictionary + context = { + "overview": overview_data, + "reactions": [], + "compounds": [], + "genes": [], + "biomass": [], + "gapfilling": gapfilling_entries, # Populated with gapfilling data + "atpanalysis": atp_analysis_entries, # Populated with ATP analysis data + } + + print("Module Path:", module_path + "/../data/") + + exchanges = {r.id for r in self.model.exchanges} + + # Identify biomass reactions using SBO annotation + biomass_reactions_ids = { + rxn.id + for rxn in self.model.reactions + if rxn.annotation.get("sbo") == "SBO:0000629" + } + + # Reactions Tab + for 
rxn in self.model.reactions: + if rxn.id not in exchanges and rxn.id not in biomass_reactions_ids: + equation = rxn.build_reaction_string(use_metabolite_names=True) + rxn_data = { + "id": rxn.id, + "name": rxn.name, + "equation": equation, + "genes": rxn.gene_reaction_rule, + "gapfilling": "; ".join( + gapfilling_reaction_summary.get(rxn.id, []) + ), # Empty list results in an empty string + } + context["reactions"].append(rxn_data) + + # Compounds Tab + for cpd in self.model.metabolites: + cpd_data = { + "id": cpd.id, + "name": cpd.name, + "formula": cpd.formula, + "charge": cpd.charge, + "compartment": cpd.compartment, + } + context["compounds"].append(cpd_data) + + # Genes Tab + for gene in self.model.genes: + gene_data = { + "gene": gene.id, + "reactions": "; ".join([rxn.id for rxn in gene.reactions]), + } + context["genes"].append(gene_data) + + # Biomass Tab + if biomass_reactions_ids: + for biomass_rxn_id in biomass_reactions_ids: + biomass_rxn = self.model.reactions.get_by_id(biomass_rxn_id) + for metabolite, coefficient in biomass_rxn.metabolites.items(): + compound_id = metabolite.id + compound_name = metabolite.name.split("_")[0] + compartment = compound_id.split("_")[-1] + + biomass_data = { + "biomass_reaction_id": biomass_rxn.id, + "biomass_compound_id": compound_id, + "name": compound_name, + "coefficient": coefficient, + "compartment": compartment, + } + context["biomass"].append(biomass_data) + else: + print("No biomass reactions found in the model.") + + # Gapfilling Tab + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + gapfilling_data = self.extract_gapfilling_data(gf_sensitivity) + context["gapfilling"] = gapfilling_entries + + # Extract ATP Production Data + atp_production_data = self.extract_atp_production_data(atp_analysis) + + # Populate the 'atpanalysis' context with ATP production data + for entry in context["atpanalysis"]: + media = entry["media"] + entry["atp_production"] = atp_production_data.get(media, None) + + # Diagnostics + unique_biomass_rxns = biomass_reactions_ids + print(f"Unique biomass reactions identified: {len(unique_biomass_rxns)}") + print(f"Biomass Reaction IDs: {', '.join(unique_biomass_rxns)}") + + print("\nFirst 2 reactions:") + for rxn in context["reactions"][:2]: + print(rxn) + + print("\nFirst 2 compounds:") + for cpd in context["compounds"][:2]: + print(cpd) + + print("\nFirst 2 genes:") + for gene in context["genes"][:2]: + print(gene) + + print("\nFirst 2 biomass compounds:") + for bm in context["biomass"][:2]: + print(bm) + + print("\nFirst 2 gapfilling entries:") + for gf in context["gapfilling"][:2]: + print(gf) + + print("\nFirst 2 ATP Analysis entries:") + for entry in context["atpanalysis"][:2]: + print(entry) + + # Render with template + env = jinja2.Environment( + loader=jinja2.FileSystemLoader(module_path + "/../data/"), + autoescape=jinja2.select_autoescape(["html", "xml"]), + ) + html = env.get_template("ModelReportTemplate.html").render(context) + directory = dirname(output_path) + os.makedirs(directory, exist_ok=True) + with open(output_path, "w") as f: + f.write(html) + + def build_report(self, output_path): + """Builds model HTML report for the Model Summary table + Parameters + ---------- + model : cobra.Model + Model to use to build the report + """ + + # 1. Utilize the build_overview_data method + model_summary_data = self.build_overview_data() + # Remove the unwanted entry + model_summary_data.pop("Full Gapfilling and ATP Analysis Report", None) + # 2. 
Transform the dictionary into a list of tuples + model_summary_list = [(key, value) for key, value in model_summary_data.items()] + # 3. Convert to DataFrame + model_summary_df = pd.DataFrame(model_summary_list, columns=["", ""]) + + # Style the DataFrame (as was done previously) + model_summary_df_styled = model_summary_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Fetching the gapfilling sensitivity data + gf_sensitivity = self.modelutl.attributes.get("gf_sensitivity", None) + gapfilling_data = self.extract_gapfilling_data(gf_sensitivity) + gapfilling_list = self.transform_gapfilling_data(gapfilling_data[0]) + + # Convert the gapfilling_list to a DataFrame + gapfillings_analysis_df = pd.DataFrame( + gapfilling_list, + columns=[ + "Reaction ID", + "Reaction Name", + "Media", + "Direction", + "Target", + "Gapfilling Sensitivity ID", + "Gapfilling Sensitivity Name", + ], + ) + + # Apply style to Gapfillings Analysis DataFrame + gapfillings_analysis_df_styled = gapfillings_analysis_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Legend for Gapfillings Analysis + annotations_text_gapfillings = """ +
    +
+            &nbsp;&nbsp;• Reaction ID: The identifier of the reaction.<br>
+            &nbsp;&nbsp;• Reaction Name: The name of the reaction.<br>
+            &nbsp;&nbsp;• Media: The media used by gap filling.<br>
+            &nbsp;&nbsp;• Direction: The direction of the reaction. Can be ">" for forward, "<" for reverse, or "=" for both directions.<br>
+            &nbsp;&nbsp;• Target: The reaction selected as the objective function target for the gapfilling optimization problem. Targets here can be the model's biomass reaction, commonly named "bio1" for models created by this app. Alternatively, the "rxn00062" (ATP Production) reaction is shown for cases where gapfilling was applied to guarantee ATP production in a given media. When reactions are gapfilled for ATP production, we recommend checking the full Core ATP Analysis in the table below.<br>
+            &nbsp;&nbsp;• Gapfilling Sensitivity ID and Name: Gapfilling is necessary when compounds in the biomass objective function cannot be produced by the model. For each reaction we list the biomass compound(s) that cannot be synthesized by the model without gapfilling. In cases where gapfilling fails there are two possible scenarios: 1) FBF (failed before filtering): the gapfilling immediately failed, even before we filtered out the ATP-breaking reactions. This means this objective CANNOT be satisfied with the entire current database. 2) FAF (failed after filtering): the gapfilling succeeded before filtering, but failed after filtering out reactions that break ATP. This tells you definitively whether the ATP filtering caused the gapfilling to fail.<br>
+ """ + + # Extract ATP analysis data + atp_analysis = self.modelutl.attributes.get("ATP_analysis", None) + atp_expansion_filter = self.modelutl.attributes.get("atp_expansion_filter", {}) + atp_analysis_entries = self.extract_atp_analysis_data( + atp_analysis, atp_expansion_filter + ) + + # Convert the atp_analysis_entries list to a DataFrame + atp_analysis_df = pd.DataFrame(atp_analysis_entries) + + # Apply style to ATP Analysis DataFrame + atp_analysis_df_styled = atp_analysis_df.style.hide( + axis="index" + ).set_table_styles( + [ + { + "selector": "th", + "props": [ + ("border", "none"), + ("background-color", "white"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "td", + "props": [ + ("border", "none"), + ("font-family", "Oxygen"), + ("font-size", "14px"), + ("line-height", "20px"), + ], + }, + { + "selector": "tr:nth-child(even)", + "props": [("background-color", "white")], + }, + { + "selector": "tr:nth-child(odd)", + "props": [("background-color", "#f2f2f2")], + }, + ] + ) + + # Legend for ATP Analysis + annotations_text_atp_analysis = """ +
    +
+            &nbsp;&nbsp;• No. of gapfilled reactions: The number of reactions filled by the gapfilling process.<br>
+            &nbsp;&nbsp;• Media: The media in which the reaction takes place.<br>
+            &nbsp;&nbsp;• ATP Production: ATP production by the core metabolism model.<br>
+            &nbsp;&nbsp;• Gapfilled Reactions: Reactions added during the gapfilling process.<br>
+            &nbsp;&nbsp;• Reversed Reaction by Gapfilling: Reactions that have been reversed during the gapfilling process.<br>
+            &nbsp;&nbsp;• Filtered Reactions: Reactions that have been filtered out during the analysis. When a reaction addition would lead to a large increase in ATP production or an infinite energy loop, we filter that reaction out of the gapfilling database and prevent it from being added to the model.<br>
+ """ + + # ATP analysis explanation text + explanation_text_atp_analysis = """ +
+            <p>During model reconstruction, we analyze the genome's core metabolism draft model (model without gapfilling) to assess energy biosynthesis capabilities.
+            The goal of this analysis is to ensure the core metabolism model is able to produce ATP before we expand the model to the genome scale.
+            This step is designed to prevent gapfilling from introducing reactions that create energy-generating loops.
+            The tests are conducted on a large collection of minimal conditions, with the goal of simulating the model's capability to produce energy with different electron donor, electron acceptor, and carbon source combinations.</p>
+            <p>When the draft model of the core metabolism is capable of producing ATP in at least one of the test media, no gapfilling reactions from this analysis will be added to the model. While we still report the gapfilling requirements for the test media formulations that fail to produce ATP with that draft core model, we only integrate these solutions in the model when no test media succeeds in producing ATP.
+            In this case, the integrated gapfilling solution(s) will be displayed in the "Gapfilling Analysis" table above, with the "Target" "rxn00062" (ATP Production) objective function.</p>
+            <p>The goal is to display the test results for all media to provide clues about the metabolic capabilities of the genome(s). When many reactions are required for growth on the SO4 testing media conditions, this could be a good indicator that the organism is not capable of performing sulfate reduction.
+            On the other hand, when only one gapfill reaction is required for ATP production in a given media, multiple scenarios can be considered:
+            1) the organism(s) cannot grow on the test condition, and we correctly did not add the reaction to the model; 2) there is a possible issue with the source genome annotation missing a specific gene function; 3) there is a possible issue with the model reconstruction database. We hope this data helps make more informed decisions on reactions that may need to be manually curated in the model.
+            In cases where it is known from the literature or unpublished experimental results that an organism is capable of producing ATP in a given media condition that requires gapfilling in this analysis, you can use the parameter "Force ATP media" in the reconstruction app to ensure those reactions are integrated into the model.</p>
+ """ + + # Save the data to HTML with the styled DataFrames and the legends + directory = os.path.dirname(output_path) + os.makedirs(directory, exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: + f.write('') + f.write("

Model Summary

") + f.write(model_summary_df_styled.to_html(escape=False)) + f.write("

") + f.write("

Gapfillings Analysis

") + + # Check for Gapfillings Analysis data + if not gapfillings_analysis_df.empty: + f.write(gapfillings_analysis_df_styled.to_html(escape=False)) + f.write(f"

Legend:

{annotations_text_gapfillings}") + else: + f.write( + "

Warning: No Gapfillings Analysis data available for this model.

" + ) + + f.write("

Core ATP Analysis

") + + # Check for ATP Analysis data + if not atp_analysis_df.empty: + f.write(atp_analysis_df_styled.to_html(escape=False)) + f.write(f"

Legend:

{annotations_text_atp_analysis}") + f.write(explanation_text_atp_analysis) + else: + f.write( + "

Warning: No Core ATP Analysis data available for this model.

" + ) diff --git a/modelseedpy/core/msmodelutl.py b/modelseedpy/core/msmodelutl.py old mode 100644 new mode 100755 index c9f5996a..8d9d595e --- a/modelseedpy/core/msmodelutl.py +++ b/modelseedpy/core/msmodelutl.py @@ -2,98 +2,569 @@ import logging import re import time +import json +import sys +import pandas as pd +import cobra from cobra import Model, Reaction, Metabolite +from optlang.symbolics import Zero +from cobra.flux_analysis import pfba from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.biochem.modelseed_biochem import ModelSEEDBiochem +from modelseedpy.core.fbahelper import FBAHelper +from multiprocessing import Value + +# from builtins import None logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO +core_rxns = { + "rxn00994_c0": "<", + "rxn00151_c0": ">", + "rxn24606_c0": ">", + "rxn00161_c0": ">", + "rxn14426_c0": ">", + "rxn00762_c0": "=", + "rxn05145_c0": ">", + "rxn00871_c0": ">", + "rxn01236_c0": "<", + "rxn05226_c0": ">", + "rxn01116_c0": "=", + "rxn00251_c0": "=", + "rxn05602_c0": "=", + "rxn09001_c0": ">", + "rxn00995_c0": ">", + "rxn14419_c0": ">", + "rxn14420_c0": ">", + "rxn24607_c0": "=", + "rxn00324_c0": "<", + "rxn01334_c0": "=", + "rxn05209_c0": "=", + "rxn00611_c0": "=", + "rxn00544_c0": "<", + "rxn01121_c0": ">", + "rxn03249_c0": "=", + "rxn00392_c0": "=", + "rxn05581_c0": "=", + "rxn00990_c0": ">", + "rxn00985_c0": "=", + "sul00004_c0": "=", + "rxn00160_c0": ">", + "rxn00615_c0": ">", + "rxn09003_c0": ">", + "rxn00083_c0": ">", + "rxn05493_c0": "=", + "rxn00248_c0": "=", + "rxn00678_c0": "=", + "rxn00558_c0": "=", + "rxn02376_c0": "=", + "rxn24608_c0": ">", + "rxn14424_c0": ">", + "rxn09174_c0": "=", + "rxn03250_c0": "=", + "rxn00162_c0": ">", + "rxn00549_c0": ">", + "rxn00779_c0": ">", + "rxn05573_c0": ">", + "rxn00506_c0": ">", + "rxn14425_c0": ">", + "rxn01872_c0": "=", + "rxn01996_c0": "=", + "rxn00507_c0": ">", + "rxn08528_c0": "=", + "rxn24609_c0": "=", + "rxn03884_c0": ">", + "rxn05488_c0": "=", + "rxn03079_c0": "=", + "rxn24610_c0": "=", + "rxn00178_c0": ">", + "rxn08793_c0": ">", + "rxn01130_c0": ">", + "rxn00512_c0": "<", + "rxn08355_c0": ">", + "rxn02342_c0": ">", + "rxn02314_c0": "=", + "rxn39373_c0": "=", + "rxn31759_c0": "=", + "rxn11937_c0": "<", + "rxn46184_c0": "=", + "rxn01123_c0": ">", + "rxn14421_c0": ">", + "rxn00379_c0": ">", + "rxn08734_c0": ">", + "rxn00668_c0": "=", + "rxn14418_c0": ">", + "rxn10570_c0": "=", + "rxn05553_c0": ">", + "rxn09295_c0": ">", + "rxn05759_c0": "=", + "rxn01343_c0": ">", + "rxn00545_c0": ">", + "rxn00250_c0": "=", + "rxn00785_c0": "=", + "rxn00305_c0": ">", + "rxn01387_c0": "=", + "rxn00974_c0": "=", + "rxn00604_c0": ">", + "rxn00875_c0": ">", + "rxn05528_c0": ">", + "rxn00623_c0": "<", + "rxn13974_c0": "<", + "rxn00770_c0": "=", + "rxn08900_c0": ">", + "rxn05468_c0": ">", + "rxn00199_c0": ">", + "rxn00499_c0": "=", + "rxn06493_c0": "=", + "rxn01275_c0": ">", + "rxn14412_c0": ">", + "rxn01106_c0": "=", + "rxn08428_c0": "=", + "rxn00777_c0": "=", + "rxn03644_c0": "=", + "rxn14414_c0": ">", + "rxn01480_c0": "=", + "rxn06526_c0": "=", + "rxn00543_c0": "=", + "rxn01115_c0": ">", + "rxn01870_c0": "=", + "rxn00677_c0": "=", + "rxn00799_c0": "=", + "rxn08975_c0": ">", + "rxn03240_c0": "=", + "rxn05312_c0": "<", + "rxn08558_c0": ">", + "sul00008_c0": ">", + "rxn01187_c0": ">", + "rxn00171_c0": "=", + "rxn15383_c0": ">", + "rxn00224_c0": "=", + "rxn03127_c0": "=", + "rxn01834_c0": 
"=", + "rxn24613_c0": "=", + "rxn14428_c0": "<", + "rxn08689_c0": "=", + "rxn02527_c0": ">", + "rxn00336_c0": ">", + "rxn05040_c0": ">", + "rxn08783_c0": ">", + "rxn14427_c0": ">", + "rxn00616_c0": "=", + "rxn05313_c0": ">", + "rxn03020_c0": "=", + "rxn11322_c0": "=", + "rxn00206_c0": "<", + "rxn09167_c0": ">", + "rxn10122_c0": ">", + "rxn00763_c0": "=", + "rxn06299_c0": "=", + "rxn05561_c0": "=", + "rxn08966_c0": "=", + "rxn10471_c0": "=", + "rxn15962_c0": "<", + "rxn00786_c0": "=", + "rxn00157_c0": "<", + "rxn00216_c0": "=", + "rxn00077_c0": "=", + "rxn01241_c0": "=", + "rxn01100_c0": "=", + "rxn00748_c0": ">", + "rxn00935_c0": "=", + "rxn00548_c0": "=", + "rxn08557_c0": ">", + "rxn05466_c0": "=", + "rxn08655_c0": ">", + "rxn00441_c0": ">", + "rxn01476_c0": ">", + "rxn02168_c0": "=", + "rxn00569_c0": "<", + "rxn17445_c0": ">", + "rxn01274_c0": ">", + "rxn00006_c0": "<", + "rxn08792_c0": ">", + "rxn08691_c0": "=", + "sul00003_c0": "=", + "rxn04794_c0": "=", + "rxn00568_c0": "<", + "rxn00225_c0": "=", + "rxn09318_c0": "=", + "rxn01057_c0": "=", + "rxn00247_c0": ">", + "rxn00285_c0": "=", + "rxn09004_c0": "=", + "rxn24612_c0": "=", + "rxn00371_c0": ">", + "rxn00159_c0": ">", + "rxn01333_c0": "=", + "rxn01388_c0": "=", + "rxn02480_c0": "=", + "rxn02167_c0": ">", + "rxn08971_c0": ">", + "rxn00612_c0": "=", + "rxn01806_c0": ">", + "rxn00148_c0": "<", + "rxn00122_c0": ">", + "rxn05469_c0": "=", + "rxn00265_c0": ">", + "rxn00330_c0": "<", + "rxn00602_c0": "<", + "rxn08179_c0": ">", + "rxn09269_c0": ">", + "rxn01200_c0": "=", + "rxn08556_c0": ">", + "rxn05627_c0": ">", + "rxn08656_c0": ">", + "rxn00097_c0": "=", + "rxn05319_c0": "=", + "rxn03085_c0": "=", + "rxn08178_c0": ">", + "rxn00747_c0": "=", + "rxn05559_c0": "=", + "rxn09314_c0": ">", + "rxn15961_c0": "=", + "rxn08976_c0": ">", + "rxn00172_c0": "<", + "rxn00868_c0": "<", + "rxn08173_c0": "=", + "rxn00102_c0": "=", + "rxn09272_c0": ">", + "rxn03126_c0": "=", + "sul00002_c0": "=", + "rxn01871_c0": "<", + "rxn00500_c0": "=", + "rxn00175_c0": ">", + "rxn00459_c0": "=", + "rxn24611_c0": "=", + "rxn09008_c0": "=", + "rxn00173_c0": "=", + "rxn33011_c0": "=", + "rxn08901_c0": ">", + "rxn00782_c0": "<", + "rxn03643_c0": "=", + "rxn08527_c0": "=", + "rxn00869_c0": "<", + "rxn05651_c0": "=", + "rxn10126_c0": ">", + "rxn00874_c0": "=", + "rxn10577_c0": ">", + "rxn00001_c0": ">", + "sul00010_c0": ">", + "rxn05625_c0": "=", + "rxn00670_c0": "=", + "rxn00147_c0": ">", + "rxn00288_c0": ">", + "rxn06777_c0": "=", + "rxn01452_c0": "<", + "rxn08518_c0": ">", + "rxn14422_c0": ">", + "rxn01477_c0": ">", + "rxn08350_c0": "=", + "rxn00256_c0": "<", + "rxn08977_c0": ">", + "rxn00781_c0": "=", + "rxn05467_c0": "=", + "rxn00011_c0": "<", + "rxn39175_c0": "=", + "rxn14423_c0": ">", + "rxn40505_c0": "=" +} -def metabolite_msid(metabolite): - if re.search("^(cpd\d+)", metabolite.id): - m = re.search("^(cpd\d+)", metabolite.id) - return m[1] - for anno in metabolite.annotation: - if isinstance(metabolite.annotation[anno], list): - for item in metabolite.annotation[anno]: - if re.search("^(cpd\d+)", item): - m = re.search("^(cpd\d+)", item) - return m[1] - elif re.search("^(cpd\d+)", metabolite.annotation[anno]): - m = re.search("^(cpd\d+)", metabolite.annotation[anno]) - return m[1] - return None - - -def reaction_msid(reaction): - if re.search("^(rxn\d+)", reaction.id): - m = re.search("^(rxn\d+)", reaction.id) - return m[1] - for anno in reaction.annotation: - if isinstance(reaction.annotation[anno], list): - for item in reaction.annotation[anno]: - if 
re.search("^(rxn\d+)", item): - m = re.search("^(rxn\d+)", item) - return m[1] - elif re.search("^(rxn\d+)", reaction.annotation[anno]): - m = re.search("^(rxn\d+)", reaction.annotation[anno]) +class MSModelUtil: + mdlutls = {} + + @staticmethod + def metabolite_msid(metabolite): + if re.search("^(cpd\d+)", metabolite.id): + m = re.search("^(cpd\d+)", metabolite.id) return m[1] - return None + for anno in metabolite.annotation: + if isinstance(metabolite.annotation[anno], list): + for item in metabolite.annotation[anno]: + if re.search("^(cpd\d+)", item): + m = re.search("^(cpd\d+)", item) + return m[1] + elif re.search("^(cpd\d+)", metabolite.annotation[anno]): + m = re.search("^(cpd\d+)", metabolite.annotation[anno]) + return m[1] + return None + @staticmethod + def reaction_msid(reaction): + if re.search("^(rxn\d+)", reaction.id): + m = re.search("^(rxn\d+)", reaction.id) + return m[1] + for anno in reaction.annotation: + if isinstance(reaction.annotation[anno], list): + for item in reaction.annotation[anno]: + if re.search("^(rxn\d+)", item): + m = re.search("^(rxn\d+)", item) + return m[1] + elif re.search("^(rxn\d+)", reaction.annotation[anno]): + m = re.search("^(rxn\d+)", reaction.annotation[anno]) + return m[1] + return None -def stoichiometry_to_string(stoichiometry): - reactants = [] - products = [] - for met in stoichiometry: - coef = stoichiometry[met] - if not isinstance(met, str): - if metabolite_msid(met) == "cpd00067": - met = None - else: - met = met.id - if met != None: - if coef < 0: - reactants.append(met) - else: - products.append(met) - reactants.sort() - products.sort() - return [ - "+".join(reactants) + "=" + "+".join(products), - "+".join(products) + "=" + "+".join(reactants), - ] + @staticmethod + def stoichiometry_to_string(stoichiometry): + reactants = [] + products = [] + for met in stoichiometry: + coef = stoichiometry[met] + if not isinstance(met, str): + if MSModelUtil.metabolite_msid(met) == "cpd00067": + met = None + else: + met = met.id + if met != None: + if coef < 0: + reactants.append(met) + else: + products.append(met) + reactants.sort() + products.sort() + return [ + "+".join(reactants) + "=" + "+".join(products), + "+".join(products) + "=" + "+".join(reactants), + ] + @staticmethod + def search_name(name): + name = name.lower() + name = re.sub(r"_[a-z]\d*$", "", name) + name = re.sub(r"\W+", "", name) + return name -def search_name(name): - name = name.lower() - name = re.sub(r"_[a-z]\d*$", "", name) - name = re.sub(r"\W+", "", name) - return name + @staticmethod + def get(model, create_if_missing=True): + if isinstance(model, MSModelUtil): + return model + if model in MSModelUtil.mdlutls: + return MSModelUtil.mdlutls[model] + elif create_if_missing: + MSModelUtil.mdlutls[model] = MSModelUtil(model) + return MSModelUtil.mdlutls[model] + else: + return None + + @staticmethod + def build_from_kbase_json_file(filename, kbaseapi): + """ + Builds an MSModelUtil object from a KBase JSON file. + + Args: + filename (str): The path to the KBase JSON file. + kbaseapi (KBaseAPI): An instance of the KBase API. + Returns: + An MSModelUtil object representing the contents of the KBase JSON file. 
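+
+ Example (hypothetical usage; assumes a cobrakbase KBaseAPI instance and an illustrative filename):
+ mdlutl = MSModelUtil.build_from_kbase_json_file("my_model.json", kbaseapi)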
+ """ + factory = kbaseapi.KBaseObjectFactory() + model = factory.build_object_from_file(filename, "KBaseFBA.FBAModel") + return MSModelUtil(model) -class MSModelUtil: def __init__(self, model): self.model = model self.pkgmgr = MSPackageManager.get_pkg_mgr(model) + self.wsid = None self.atputl = None self.gfutl = None self.metabolite_hash = None self.search_metabolite_hash = None self.test_objective = None + self.reaction_scores = None self.score = None + self.breaking_reaction = None + self.integrated_gapfillings = [] + self.attributes = {} + self.atp_tests = None + self.reliability_scores = None + self.util=None + if hasattr(self.model, "computed_attributes"): + if self.model.computed_attributes: + self.attributes = self.model.computed_attributes + if "pathways" not in self.attributes: + self.attributes["pathways"] = {} + if "auxotrophy" not in self.attributes: + self.attributes["auxotrophy"] = {} + if "fbas" not in self.attributes: + self.attributes["fbas"] = {} + + ########I/O functions + @staticmethod + def from_cobrapy(filename): + """ + Loads a cobrapy model from a file. + + Parameters + ---------- + filename: str + The name of the file to load the model from. + + Returns + ------- + MSModelUtil + An MSModelUtil object containing the loaded model. + """ + if filename[-5:].lower() == ".json": + model = cobra.io.load_json_model(filename) + elif filename[-4:].lower() == ".xml": + #Resetting the logging level in cobrapy to avoid excess output + logging.getLogger("cobra.io.sbml").setLevel(logging.ERROR) + model = cobra.io.read_sbml_model(filename) + else: + model = cobra.io.from_json(filename) + return MSModelUtil(model) + + def save_model(self, filename, format="json"): + """ + Saves the associated cobrapy model to a json file + + Parameters + ---------- + filename: name of the file the model should be saved to + """ + if format == "json": + cobra.io.save_json_model(self.model, filename) + elif format == "xml": + cobra.io.write_sbml_model(self.model, filename) + + def printlp(self,model=None,path="",filename="debug",print=False): + if print: + if len(path) > 0: + path+"/" + lpfilename = path+filename+".lp" + if model == None: + model = self.model + with open(lpfilename, "w") as out: + out.write(str(model.solver)) + + def print_solutions(self, solution_hash,filename="reaction_solutions.csv"): + records = [] + for rxn in self.model.reactions: + record = {"id":rxn.id,"name":rxn.name,"equation":rxn.build_reaction_string(use_metabolite_names=True)} + records.append(record) + for key in solution_hash: + record[key] = solution_hash[key].fluxes[rxn.id] + df = pd.DataFrame.from_records(records) + df.to_csv(filename) + + ########FBA utility functions + def set_media(self, media): + """ + Sets the media of the model from a media object or dictionary + + Parameters + ---------- + media: MSMedia object | dict : media object or dictionary with media formulation + """ + if isinstance(media, dict): + from modelseedpy.core.msmedia import MSMedia + media = MSMedia.from_dict(media) + self.pkgmgr.getpkg("KBaseMediaPkg").build_package(media) + + ########Functions related to ATP gapfilling method + def get_atputl(self,atp_media_filename=None,core_template=None,gapfilling_delta=0,max_gapfilling=0,forced_media=[],remake_atputil=False): + """ + Returns and creates, if needed, an atp correction object for the model + + Parameters + ---------- + core_template (optional) : MSTemplate object with core reactions + atp_media_filename (optional) : string to tsv file with ATP media formulations + gapfilling_delta 
(optional) : maximum difference in gapfilling to accept ATP condition + max_gapfilling (optional) : maximum gapfilling allowable to accept an ATP growth condition + forced_media (optional) : list of media in which model MUST make ATP + + Returns + ------- + MSATPCorrection : Object for ATP correction + + Raises + ------ + """ + if not self.atputl or remake_atputil: + from modelseedpy.core.msatpcorrection import MSATPCorrection + self.atputl = MSATPCorrection( + self,core_template,[], + load_default_medias=True, + max_gapfilling=max_gapfilling, + gapfilling_delta=gapfilling_delta, + forced_media=forced_media, + default_media_path=atp_media_filename + ) + return self.atputl + + def get_atp_tests(self,core_template=None,atp_media_filename=None,recompute=False,remake_atputil=False): + """ + Attempts to get ATP tests from attributes and failing that compute de novo using MSATPCorrection + + Parameters + ---------- + core_template (optional) : MSTemplate object with core reactions + atp_media_filename (optional) : string to tsv file with ATP media formulations - def printlp(self, lpfilename="debug.lp"): - with open(lpfilename, "w") as out: - out.write(str(self.model.solver)) + Returns + ------- + list<{"media":obj media,"is_max_threshold":bool,"threshold":float,"objective":string}> + List of test specifications + + Raises + ------ + """ + #Creating MSATPCorrection object which we need regardless + atpcorrection = self.get_atputl(core_template=core_template,atp_media_filename=atp_media_filename,remake_atputil=remake_atputil) + #Returning cached tests if available + if self.atp_tests and not recompute: + return self.atp_tests + #Attempting to pull ATP tests from attributes + if not recompute: + logger.debug("Getting tests from attributes") + atp_analysis = self.get_attributes("ATP_analysis",None) + if atp_analysis: + if "tests" in atp_analysis: + self.atp_tests = [] + for item in atp_analysis["tests"]: + if item in atpcorrection.media_hash: + self.atp_tests.append({ + "media":atpcorrection.media_hash[item], + "is_max_threshold":True, + "threshold":atp_analysis["tests"][item]["threshold"], + "objective":atp_analysis["tests"][item]["objective"] + }) + return self.atp_tests + else: + logger.warning("tests attribute missing in ATP analysis. Must recalculate ATP tests!") + else: + logger.warning("ATP analysis attributes missing. 
Must recalculate ATP tests!") + #If recompute called for or if attributes are missing, recompute tests + if not core_template: + logger.warning("Cannot recompute ATP tests without a core template!") + return None + self.atp_tests = atpcorrection.build_tests() + return self.atp_tests + + def compute_automated_reaction_scores(self): + """ + Computes reaction scores automatically from model data + :return: + """ + self.reaction_scores = {} def build_metabolite_hash(self): self.metabolite_hash = {} self.search_metabolite_hash = {} for met in self.model.metabolites: + if len(met.id.split("_")) == 2: + self.add_name_to_metabolite_hash(met.id.split("_")[0],met) self.add_name_to_metabolite_hash(met.id, met) self.add_name_to_metabolite_hash(met.name, met) for anno in met.annotation: - if isinstance(met.annotation[anno], list): - for item in met.annotation[anno]: + if isinstance(met.annotation[anno], list) or isinstance(met.annotation[anno], set): + for item in list(met.annotation[anno]): self.add_name_to_metabolite_hash(item, met) else: self.add_name_to_metabolite_hash(met.annotation[anno], met) @@ -101,33 +572,47 @@ def build_metabolite_hash(self): def add_name_to_metabolite_hash(self, name, met): if name not in self.metabolite_hash: self.metabolite_hash[name] = [] - self.metabolite_hash[name].append(met) - sname = search_name(name) + if met not in self.metabolite_hash[name]: + self.metabolite_hash[name].append(met) + sname = MSModelUtil.search_name(name) if sname not in self.search_metabolite_hash: self.search_metabolite_hash[sname] = [] - self.search_metabolite_hash[sname].append(met) + if met not in self.search_metabolite_hash[sname]: + self.search_metabolite_hash[sname].append(met) - def find_met(self, name): + def find_met(self, name, compartment=None): if self.metabolite_hash == None: self.build_metabolite_hash() if name in self.metabolite_hash: - return self.metabolite_hash[name] - sname = search_name(name) + if not compartment: + return self.metabolite_hash[name] + for met in self.metabolite_hash[name]: + array = met.id.split("_") + if array[1] == compartment or met.compartment == compartment: + return [met] + return [] + sname = MSModelUtil.search_name(name) if sname in self.search_metabolite_hash: - return self.search_metabolite_hash[sname] - logger.info(name, " not found in model!") + if not compartment: + return self.search_metabolite_hash[sname] + for met in self.search_metabolite_hash[sname]: + array = met.id.split("_") + if array[1] == compartment or met.compartment == compartment: + return [met] + return [] + logger.info(name + " not found in model!") return [] def rxn_hash(self): output = {} for rxn in self.model.reactions: - strings = stoichiometry_to_string(rxn.metabolites) + strings = MSModelUtil.stoichiometry_to_string(rxn.metabolites) output[strings[0]] = [rxn, 1] output[strings[1]] = [rxn, -1] return output def find_reaction(self, stoichiometry): - output = stoichiometry_to_string(stoichiometry) + output = MSModelUtil.stoichiometry_to_string(stoichiometry) atpstring = output[0] rxn_hash = self.rxn_hash() if atpstring in rxn_hash: @@ -137,7 +622,7 @@ def find_reaction(self, stoichiometry): def msid_hash(self): output = {} for cpd in self.model.metabolites: - msid = metabolite_msid(cpd) + msid = MSModelUtil.metabolite_msid(cpd) if msid != None: if msid not in output: output[msid] = [] @@ -147,10 +632,134 @@ def msid_hash(self): def exchange_list(self): exchange_reactions = [] for reaction in self.model.reactions: - if reaction.id[:3] == "EX_": + if reaction.id[:3] in 
["EX_","EXF"]: exchange_reactions.append(reaction) return exchange_reactions + def nonexchange_reaction_count(self): + count = 0 + for reaction in self.model.reactions: + if ( + reaction.id[:3] != "EX_" + and reaction.id[:3] != "SK_" + and reaction.id[:3] != "DM_" + and reaction.id[:3] != "bio" + ): + if reaction.upper_bound > 0 or reaction.lower_bound < 0: + count += 1 + return count + + def reaction_scores(self): + return {} + + ################################################################################# + # Functions related to phenotype simultion + # Design philosophy: the phenotype types should be aware of phenotype data and + # agnostic to the model, so this code handles how to simulate a phenotype in a + # model. This code sets the model objective based on the phenotype type and adds + # the appropriate exchange reactions. + ################################################################################# + def set_objective_from_phenotype(self,phenotype,missing_transporters=[],create_missing_compounds=False): + if phenotype.type == "growth": + if "bio1" in self.model.reactions: + self.model.objective = "bio1" + else: + logger.critical(phenotype.id+": growth phenotype but could not find biomass reaction!") + return None + if phenotype.type == "uptake" or phenotype.type == "excretion": + uptake = excretion = 0 + if phenotype.type == "uptake": + uptake = 1000 + else: + excretion = 1000 + if len(phenotype.additional_compounds) == 0: + logger.critical(phenotype.id+": can't set uptake or excretion objective without additional compounds specified!") + return None + first = True + for cpd in phenotype.additional_compounds: + exid = "EX_"+cpd+"_e0" + if exid not in self.model.reactions: + exid = "EX_"+cpd+"_c0" + if exid not in self.model.reactions: + exmets = self.find_met(cpd,"c0") + if len(exmets) == 0: + if create_missing_compounds: + exmets = [Metabolite(cpd+"_c0",name=cpd+"_c0",compartment="c0")] + self.model.add_metabolites(exmets) + else: + logger.warning(phenotype.id+": could not find metabolite for "+cpd) + return None + self.add_exchanges_for_metabolites(exmets,uptake=uptake,excretion=excretion) + missing_transporters.append(cpd) + if first: + self.model.objective = exid + first = False + else: + self.model.objective += exid + if phenotype.type == "excretion": + for reaction in self.model.reactions: + if reaction.objective_coefficient != 0: + reaction.objective_coefficient = -1*reaction.objective_coefficient + self.model.objective.direction = 'max' + return str(self.model.objective) + + ################################################################################# + # Functions related to exchanges and transport reactions + ################################################################################# + def add_transport_and_exchange_for_metabolite(self, met,direction="=",prefix="trans",override=False): + #If met is a string, attempt to find the associated metabolite + if isinstance(met,str): + mets = self.find_met(met) + if len(mets) == 0: + logger.critical("Metabolite "+met+" not found in model") + return None + met = mets[0] + #Breaking down the ID to see the compartment and index - ID must take form _ + output = MSModelUtil.parse_id(met) + if not output: + logger.critical("Transport metabolite ID " + met.id + " not in proper format") + return None + (baseid,compartment,index) = output + #Checking if exchange already exists + if baseid+"_e0" in self.model.metabolites and not override: + logger.critical("Transport reaction appears to already exist for " + met.id+". 
Override if transport still desired.") + return None + elif baseid+"_e0" not in self.model.metabolites: + exmet = Metabolite(baseid+"_e0",name=met.name+"_e0",compartment="e0",charge=met.charge,formula=met.formula) + self.model.add_metabolites([exmet]) + else: + exmet = self.model.metabolites.get_by_id(baseid+"_e0") + #Checking charge so transport will be charge balanced + hmet = None + exhmet = None + if met.charge != 0: + #Finding H+ compound in model: + output = self.find_met("cpd00067",compartment+str(index)) + if len(output) > 0: + hmet = output[0] + output = self.find_met("cpd00067","e0") + if len(output) > 0: + exhmet = output[0] + if not hmet or not exhmet: + logger.warning("No H+ metabolite found in model") + stoich = {met:-1,exmet:1} + if met.charge != 0 and hmet and exhmet: + stoich[hmet] = met.charge + stoich[exhmet] = -1*met.charge + transport = Reaction(prefix + met.id + "_"+compartment+str(index)) + transport.name = "Charge-nuetral transport for " + met.name + transport.add_metabolites(stoich) + transport.annotation["sbo"] = "SBO:0000185" + transport.upper_bound = 0 + transport.lower_bound = 0 + if direction == ">" or direction == "=": + transport.upper_bound = 1000 + if direction == "<" or direction == "=": + transport.lower_bound = -1000 + self.model.add_reactions([transport]) + self.add_exchanges_for_metabolites([exmet],0,1000) + return transport + def exchange_hash(self): exchange_reactions = {} exlist = self.exchange_list() @@ -161,7 +770,7 @@ def exchange_hash(self): else: logger.warn("Nonstandard exchange reaction ignored:" + reaction.id) return exchange_reactions - + def add_missing_exchanges(self, media): output = [] exchange_hash = self.exchange_hash() @@ -207,8 +816,243 @@ def add_exchanges_for_metabolites( self.model.add_reactions(drains) return drains - def reaction_scores(self): - return {} + ################################################################################# + # Functions related to editing the model + ################################################################################# + def get_attributes(self, key=None, default=None): + if not key: + return self.attributes + if key not in self.attributes: + self.attributes[key] = default + return self.attributes[key] + + def save_attributes(self, value=None, key=None): + if value: + if key: + self.attributes[key] = value + else: + self.attributes = value + if hasattr(self.model, "computed_attributes"): + logger.info("Setting FBAModel computed_attributes to mdlutl attributes") + self.attributes["gene_count"] = len(self.model.genes) + self.model.computed_attributes = self.attributes + + def add_ms_reaction(self, rxn_dict, compartment_trans=["c0", "e0"]): + modelseed = ModelSEEDBiochem.get() + output = [] + for rxnid, compartment in rxn_dict.items(): + fullid = rxnid + "_" + compartment + modelseed_reaction = modelseed.get_seed_reaction(rxnid) + reaction_stoich = modelseed_reaction.cstoichiometry + cobra_reaction = Reaction(fullid) + output.append(cobra_reaction) + cobra_reaction.name = modelseed_reaction.data["name"] + "_" + compartment + metabolites_to_add = {} + for metabolite, stoich in reaction_stoich.items(): + id = metabolite[0] + compound = modelseed.get_seed_compound(id).data + compartment_number = int(metabolite[1]) + if compartment_number > len(compartment_trans): + logger.critical( + "Compartment index " + str(compartment_number) + " out of range" + ) + compartment_string = compartment_trans[compartment_number] + met_output = self.find_met(id, compartment_string) + cobramet = None + if 
met_output: + cobramet = met_output[0] + else: + cobramet = Metabolite( + id + "_" + compartment_string, + name=compound["name"] + "_" + compartment_string, + compartment=compartment_string, + ) + metabolites_to_add[cobramet] = stoich + cobra_reaction.add_metabolites(metabolites_to_add) + cobra_reaction.reaction + self.model.add_reactions(output) + return output + + ################################################################################# + # Functions related to utility functions + ################################################################################# + def assign_reliability_scores_to_reactions(self,active_reaction_sets=[]): + """Assigns a reliability score to every model reaction which indicates how likely the reaction is to be accurate and to take place + + Returns + ------- + { reaction ID : { reaction direction : score } } + """ + active_rxn_dictionary={} + for item in active_reaction_sets: + for array in item: + if array[0] not in active_rxn_dictionary: + active_rxn_dictionary[array[0]] = {} + if array[1] not in active_rxn_dictionary[array[0]]: + active_rxn_dictionary[array[0]][array[1]] = 0 + active_rxn_dictionary[array[0]][array[1]]+=1 + if self.reliability_scores == None: + self.reliability_scores = {} + biochem = ModelSEEDBiochem.get() + for reaction in self.model.reactions: + #Pulling model reaction related data + transported_charge = 0 + for met in reaction.metabolites: + coef = reaction.metabolites[met] + if met.id.split("_")[-1][0:1] == "e": + transported_charge += coef * met.charge + #Pulling ModelSEED Biochemistry related data + msid = MSModelUtil.reaction_msid(reaction) + if msid and msid != "rxn00000" and msid in biochem.reactions: + #Penalizing for net transport of ions in the wrong direction + forwardscore = 0 + reversescore = 0 + if transported_charge > 0: + forwardscore += 50*transported_charge + if transported_charge < 0: + reversescore += -50*transported_charge + basescore = 0 + msrxn = biochem.reactions.get_by_id(msid) + #Penalizing for mass imbalance + if msrxn.status[0:2] == "MI": + basescore = 1000 + #Penalizing for charge imbalance + if msrxn.status[0:2] == "CI": + basescore = 800 + #Penalizing if no pathways TODO - cannot use this with ModelSEEDDatabase instead of ModelSEEDBiochem + #if msrxn["pathways"] == None: + # basescore = 50 + #Penalizing if there is no deltaG + if msrxn.delta_g == 10000000 or msrxn.delta_g == None: + basescore = 200 + else: + #Penalizing in the direction of infeasibility + if msrxn.delta_g <= -5: + reversescore += 20 + if msrxn.delta_g <= -10: + reversescore += 20 + if msrxn.delta_g >= 5: + forwardscore += 20 + if msrxn.delta_g >= 10: + forwardscore += 20 + #Penalizing reactions in direction of production of ATP + for cpd in msrxn.metabolites: + if cpd.id == "cpd00002": + if msrxn.metabolites[cpd] < 0: + reversescore += 100 + elif msrxn.metabolites[cpd] > 0: + forwardscore += 100 + if cpd.inchi_key == None: + basescore += 40 + if cpd.formula == None: + basescore += 60 + if cpd.delta_g == 10000000 or cpd.delta_g == None: + basescore += 20 + self.reliability_scores[reaction.id] = {} + self.reliability_scores[reaction.id][">"] = basescore+forwardscore + self.reliability_scores[reaction.id]["<"] = basescore+reversescore + elif reaction.id[0:3] == "EX_" or reaction.id[0:3] == "SK_" or reaction.id[0:3] == "DM_" or reaction.id[0:3] == "bio": + self.reliability_scores[reaction.id] = {} + self.reliability_scores[reaction.id][">"] = -10 + self.reliability_scores[reaction.id]["<"] = -10 + else: + 
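#Reactions with no ModelSEED biochemistry match default to a high penalty (1000) in both directions +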
self.reliability_scores[reaction.id] = {} + self.reliability_scores[reaction.id][">"] = 1000 + self.reliability_scores[reaction.id]["<"] = 1000 + for_multiplier = 1 + rev_multiplier = 1 + if reaction.id in active_rxn_dictionary: + if ">" in active_rxn_dictionary[reaction.id]: + for_multiplier += 0.1*active_rxn_dictionary[reaction.id][">"] + if "<" in active_rxn_dictionary[reaction.id]: + rev_multiplier += 0.1*active_rxn_dictionary[reaction.id]["<"] + self.reliability_scores[reaction.id][">"] = self.reliability_scores[reaction.id][">"]*for_multiplier + self.reliability_scores[reaction.id]["<"] = self.reliability_scores[reaction.id]["<"]*rev_multiplier + return self.reliability_scores + + def is_core(self,rxn): + """Indicates if a specified reaction is a core reaction + + Parameters + ---------- + rxn : Reaction|string + + Returns + ------- + bool + """ + if not isinstance(rxn, str): + rxn = rxn.id + if "core_reactions" in self.get_attributes(): + logger.debug("Using core reactions attribute!") + if rxn in self.get_attributes("core_reactions"): + return True + return False + elif rxn in core_rxns: + return True + return False + + def build_model_data_hash(self): + data = { + "Model": self.model.id, + "Genome": self.genome.info.metadata["Name"], + "Genes": self.genome.info.metadata["Number of Protein Encoding Genes"], + } + return data + + def compare_reactions(self, reaction_list, filename): + data = {} + for rxn in reaction_list: + for met in rxn.metabolites: + if met.id not in data: + data[met.id] = {} + for other_rxn in reaction_list: + data[met.id][other_rxn.id] = 0 + data[met.id][rxn.id] = rxn.metabolites[met] + df = pd.DataFrame(data) + df = df.transpose() + df.to_csv(filename) + + ################################################################################# + # Functions related to managing biomass reactions + ################################################################################# + def evaluate_biomass_reaction_mass(self, biomass_rxn_id, normalize=False): + biorxn = self.model.reactions.get_by_id(biomass_rxn_id) + # First computing energy biosynthesis coefficients + atp = None + atp_compounds = { + "cpd00002": -1, + "cpd00001": -1, + "cpd00008": 1, + "cpd00009": 1, + "cpd00067": 1, + } + mass_compounds = {"cpd11463": 1, "cpd11461": 1, "cpd11462": 1} + process_compounds = {"cpd17041": 1, "cpd17042": 1, "cpd17043": 1} + for met in biorxn.metabolites: + msid = self.metabolite_msid(met) + if msid == "cpd00008": + atp = abs(biorxn.metabolites[met]) + # Computing non ATP total mass + total = 0 + for met in biorxn.metabolites: + msid = self.metabolite_msid(met) + if msid == "cpd11416": + continue + coef = biorxn.metabolites[met] + if msid in mass_compounds: + total += coef + elif msid in process_compounds: + total += 0 + else: + mw = FBAHelper.metabolite_mw(met) + if msid in atp_compounds: + if coef < 0: + coef += atp + else: + coef += -1 * atp + total += mw * coef / 1000 + return {"ATP": atp, "Total": total} # Required this function to add gapfilled compounds to a KBase model for saving gapfilled model def convert_cobra_compound_to_kbcompound(self, cpd, kbmodel, add_to_model=1): @@ -320,83 +1164,233 @@ def convert_cobra_reaction_to_kbreaction( kbmodel["modelreactions"].append(rxn_data) return rxn_data - def add_gapfilling_solution_to_kbase_model( - self, - newmodel, - gapfilled_reactions, - gfid=None, - media_ref=None, - reaction_genes=None, - ): + ################################################################################# + # Functions related to gapfilling of models + 
################################################################################# + def convert_solution_to_list(self,solution): + """Converting solution to list format, which is easier to work with + Parameters + ---------- + solution : dict + Specifies the reactions to be added to the model to implement the gapfilling solution """ - NOTE: to be moved to cobrakbase + output = [] + for label in ["new","reversed"]: + for rxn_id in solution[label]: + output.append([rxn_id, solution[label][rxn_id],label]) + return output + + def find_item_in_solution(self,input_list,input,ignore_dir=False): + for item in input_list: + if input[0] == item[0] and input[1] == item[1]: + return True + elif ignore_dir and input[0] == item[0]: + return True + return False + + def test_solution(self,solution,targets,medias,thresholds=[0.1],remove_unneeded_reactions=False,do_not_remove_list=[]): + """Tests if every reaction in a given gapfilling solution is actually needed for growth. Note, this code assumes the gapfilling solution is already integrated. + + Parameters + ---------- + solution : {"new":{string reaction_id: string direction},"reversed":{string reaction_id: string direction}} + or + list<list> + Data for gapfilling solution to be tested + targets : list<string>, + medias : list<MSMedia>, + thresholds : list<float>, default [0.1] + + Returns + ------- + list<list> + List of unneeded reactions + + Raises + ------ """ - rxn_table = [] - gapfilling_obj = None - if gfid == None: - largest_index = 0 - for gapfilling in newmodel["gapfillings"]: - current_index = int(gapfilling["id"].split(".").pop()) - if largest_index == 0 or largest_index < current_index: - largest_index = current_index - largest_index += 1 - gfid = "gf." + str(largest_index) + #Saving the current objective + current_objective = self.model.objective + #Saving the current media + current_media = self.pkgmgr.getpkg("KBaseMediaPkg").current_media + #Computing the initial objective values + initial_objectives = [] + for (i,target) in enumerate(targets): + #Setting the media + self.pkgmgr.getpkg("KBaseMediaPkg").build_package(medias[i]) + #Setting the objective + self.model.objective = target + #Computing the objective value + objective = self.model.slim_optimize() + initial_objectives.append(objective) + logger.debug("Starting objective for " + medias[i].id + "/"+target+" = " + str(objective)) + #Iterating through solution reactions and flagging them if they are unneeded to achieve the specified minimum objective + unneeded = [] + #If object is a dictionary, convert to a list + if isinstance(solution,dict): + solution = self.convert_solution_to_list(solution) + #Processing solution in standardized format + for item in solution: + rxn_id = item[0] + other_original_bound = None + rxnobj = self.model.reactions.get_by_id(rxn_id) + if item[1] == ">": + original_bound = rxnobj.upper_bound + if rxnobj.lower_bound > 0: + other_original_bound = rxnobj.lower_bound + else: + original_bound = rxnobj.lower_bound + if rxnobj.upper_bound < 0: + other_original_bound = rxnobj.upper_bound + #Testing all media and target and threshold combinations to see if the reaction is needed + needed = False + for (i,target) in enumerate(targets): + if len(targets) > 1:#If there's only one target, then these steps were done earlier already + #Setting the media + self.pkgmgr.getpkg("KBaseMediaPkg").build_package(medias[i]) + #Setting the objective + self.model.objective = target + #Knocking out the reaction to test for the impact on the objective + #This has to happen after media is applied in case the 
reaction is an exchange + if item[1] == ">": + if rxnobj.lower_bound > 0: + rxnobj.lower_bound = 0 + rxnobj.upper_bound = 0 + else: + if rxnobj.upper_bound < 0: + rxnobj.upper_bound = 0 + rxnobj.lower_bound = 0 + #Computing the objective value + objective = self.model.slim_optimize() + if objective < thresholds[i]: + needed = True + logger.info( + medias[i].id + "/" + target + ":" +rxn_id + + item[1] + + " needed:" + + str(objective) + + " with min obj:" + + str(thresholds[i]) + ) + #If the reaction isn't needed for any media and target combinations, add it to the unneeded list + if not needed: + unneeded.append([rxn_id, item[1], item[2],original_bound,other_original_bound]) + logger.info( + rxn_id + + item[1] + + " not needed:" + + str(objective) + ) + #VERY IMPORTANT: Leave the reaction knocked out for now so we screen for combinatorial effects + else: + #Restore the reaction if it is needed + if item[1] == ">": + rxnobj.upper_bound = original_bound + if other_original_bound != None: + rxnobj.lower_bound = other_original_bound + else: + rxnobj.lower_bound = original_bound + if other_original_bound != None: + rxnobj.upper_bound = other_original_bound + if not remove_unneeded_reactions: + #Restoring the bounds on the unneeded reactions + for item in unneeded: + rxnobj = self.model.reactions.get_by_id(item[0]) + if item[1] == ">": + rxnobj.upper_bound = item[3] + if item[4] != None: + rxnobj.lower_bound = item[4] + else: + rxnobj.lower_bound = item[3] + if item[4] != None: + rxnobj.upper_bound = item[4] else: - for gapfilling in newmodel["gapfillings"]: - if gapfilling["id"] == gfid: - gapfilling_obj = gapfilling - if gapfilling_obj == None: + #Do not restore bounds on unneeded reactions and remove reactions from model if their bounds are zero + removed_rxns = [] + for item in unneeded: + rxnobj = self.model.reactions.get_by_id(item[0]) + if self.find_item_in_solution(do_not_remove_list,item): + if item[1] == ">": + rxnobj.upper_bound = item[3] + if item[4] != None: + rxnobj.lower_bound = item[4] + else: + rxnobj.lower_bound = item[3] + if item[4] != None: + rxnobj.upper_bound = item[4] + elif rxnobj.lower_bound == 0 and rxnobj.upper_bound == 0 and not self.find_item_in_solution(do_not_remove_list,item,ignore_dir=True): + removed_rxns.append(rxnobj) + if len(removed_rxns) > 0: + self.model.remove_reactions(removed_rxns) + #Restoring the original objective + self.model.objective = current_objective + #Restoring the original media + if current_media: + self.pkgmgr.getpkg("KBaseMediaPkg").build_package(current_media) + #Returning the unneeded list + return unneeded + + def add_gapfilling(self, solution): + logger.info("Adding gapfilling:"+str(solution)) + self.integrated_gapfillings.append(solution) + + def create_kb_gapfilling_data(self, kbmodel, atpmedia_ws="94026"): + gapfilling_hash = {} + if "gapfillings" not in kbmodel: + kbmodel["gapfillings"] = [] + for gapfilling in kbmodel["gapfillings"]: + gapfilling_hash[gapfilling["id"]] = gapfilling + rxn_hash = {} + for rxn in kbmodel["modelreactions"]: + rxn_hash[rxn["id"]] = rxn + for gf in self.integrated_gapfillings: + media_ref = "KBaseMedia/Empty" + gfid = gf["media"].id.replace("/", ".") + if self.atputl: + for item in self.atputl.atp_medias: + if item[0] == gf["media"]: + gfid = "ATP-" + gfid + media_ref = atpmedia_ws + "/" + gf["media"].id + ".atp" + break + if hasattr(gf["media"], "info"): + media_ref = gf["media"].info.workspace_id + "/" + gf["media"].info.id + suffix = 0 + while gfid in gapfilling_hash: + suffix += 1
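+ #Extend the gapfilling ID with a numeric suffix until it no longer collides with an existing gapfilling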
+ gfid += "." + str(suffix) + gapfilling_hash[gfid] = 1 gapfilling_obj = { - "gapfill_id": newmodel["id"] + "." + gfid, + "gapfill_id": gfid, "id": gfid, "integrated": 1, "integrated_solution": "0", + "target": gf["target"], + "minobjective": gf["minobjective"], + "binary_check": gf["binary_check"], "media_ref": media_ref, } - newmodel["gapfillings"].append(gapfilling_obj) - cpd_hash = {} - for cpd in newmodel["modelcompounds"]: - cpd_hash[cpd["id"]] = cpd - for rxn in gapfilled_reactions["new"]: - reaction = self.model.reactions.get_by_id(rxn) - kbrxn = self.convert_cobra_reaction_to_kbreaction( - reaction, - newmodel, - cpd_hash, - gapfilled_reactions["new"][rxn], - 1, - reaction_genes, - ) - kbrxn["gapfill_data"][gfid] = dict() - kbrxn["gapfill_data"][gfid]["0"] = [gapfilled_reactions["new"][rxn], 1, []] - # rxn_table.append({ - # 'id':kbrxn["id"], - # 'name':kbrxn["name"], - # 'direction':format_direction(kbrxn["direction"]), - # 'gene':format_gpr(kbrxn), - # 'equation':format_equation(kbrxn,cpd_hash), - # 'newrxn':1 - # }) - for rxn in gapfilled_reactions["reversed"]: - for kbrxn in newmodel["modelreactions"]: - if kbrxn["id"] == rxn: - kbrxn["direction"] = "=" - # rxn_table.append({ - # 'id':kbrxn["id"], - # 'name':kbrxn["name"], - # 'direction':format_direction(kbrxn["direction"]), - # 'gene':format_gpr(kbrxn), - # 'equation':format_equation(kbrxn,cpd_hash), - # 'newrxn':0 - # }) - kbrxn["gapfill_data"][gfid] = dict() - kbrxn["gapfill_data"][gfid]["0"] = [ - gapfilled_reactions["reversed"][rxn], - 1, - [], - ] - return rxn_table + kbmodel["gapfillings"].append(gapfilling_obj) + for rxn in gf["new"]: + if rxn in rxn_hash: + rxnobj = rxn_hash[rxn] + if "gapfill_data" not in rxnobj: + rxnobj["gapfill_data"] = {} + if gfid not in rxnobj["gapfill_data"]: + rxnobj["gapfill_data"][gfid] = {"0": [gf["new"][rxn], 1, []]} + for rxn in gf["reversed"]: + if rxn in rxn_hash: + rxnobj = rxn_hash[rxn] + if "gapfill_data" not in rxnobj: + rxnobj["gapfill_data"] = {} + if gfid not in rxnobj["gapfill_data"]: + rxnobj["gapfill_data"][gfid] = { + "0": [gf["reversed"][rxn], 1, []] + } + ################################################################################# + # Functions related to applying, running, and expanding with test conditions + ################################################################################# def apply_test_condition(self, condition, model=None): """Applies constraints and objective of specified condition to model @@ -421,13 +1415,13 @@ def apply_test_condition(self, condition, model=None): else: pkgmgr = MSPackageManager.get_pkg_mgr(model) model.objective = condition["objective"] - if condition["is_max_threshold"]: - model.objective.direction = "max" - else: - model.objective.direction = "min" + #if condition["is_max_threshold"]: + model.objective.direction = "max" + #else: TODO - need to revisit this + # model.objective.direction = "min" pkgmgr.getpkg("KBaseMediaPkg").build_package(condition["media"]) - def test_single_condition(self, condition, apply_condition=True, model=None): + def test_single_condition(self, condition, apply_condition=True, model=None,report_atp_loop_reactions=False,analyze_failures=False,rxn_list=[]): """Runs a single test condition to determine if objective value on set media exceeds threshold Parameters @@ -454,33 +1448,44 @@ def test_single_condition(self, condition, apply_condition=True, model=None): new_objective = model.slim_optimize() value = new_objective if "change" in condition and condition["change"]: - if self.test_objective is not 
None: + if self.test_objective: value = new_objective - self.test_objective + logger.debug( + condition["media"].id + + " testing for change:" + + str(value) + + "=" + + str(new_objective) + + "-" + + str(self.test_objective) + ) self.score = value if model.solver.status != "optimal": - self.printlp("Infeasible.lp") - logger.critical("Infeasible problem - LP file printed to debug!") + self.printlp(filename=condition["media"].id + "-Testing-Infeasible", print=True) + logger.critical( + condition["media"].id + + " testing leads to infeasible problem. LP file printed to debug!" + ) return False if value >= condition["threshold"] and condition["is_max_threshold"]: - logger.debug( - "Failed high:" - + str(self.test_objective) - + ";" - + str(condition["threshold"]) - ) + logger.debug("Failed high:"+condition["media"].id+":"+str(new_objective)+";"+str(condition["threshold"])) + if analyze_failures and len(rxn_list) == 1: + #Constraining test objective at failed value + if value > 1000: + value = 1000 + self.model.reactions.get_by_id(condition["objective"]).lower_bound = value + solution = pfba(self.model) + self.analyze_minimal_reaction_set(solution,rxn_list[0][0].id) + self.model.reactions.get_by_id(condition["objective"]).lower_bound = 0 return False elif value <= condition["threshold"] and not condition["is_max_threshold"]: - logger.debug( - "Failed low:" - + str(self.test_objective) - + ";" - + str(condition["threshold"]) - ) + logger.debug("Failed low:"+condition["media"].id+":"+str(new_objective)+";"+str(condition["threshold"])) return False self.test_objective = new_objective + logger.debug("Passed:"+condition["media"].id+":"+str(new_objective)+";"+str(condition["threshold"])) return True - def test_condition_list(self, condition_list: list, model=None): + def test_condition_list(self, condition_list, model=None,positive_growth=[],rxn_list=[]): """Runs a set of test conditions to determine if objective values on set medias exceed thresholds Parameters @@ -501,13 +1506,65 @@ def test_condition_list(self, condition_list: list, model=None): if model == None: model = self.model for condition in condition_list: - if not self.test_single_condition(condition, True, model): + if not self.test_single_condition(condition,apply_condition=True,model=model,rxn_list=rxn_list): return False return True - def reaction_expansion_test(self, reaction_list: list, condition_list: list): - """Adds reactions in reaction list one by one and appplies tests, filtering reactions that fail + def linear_expansion_test(self, reaction_list, condition, currmodel,positive_growth=[]): + """Tests addition of reactions one at a time + Parameters + ---------- + reaction_list : list<[obj reaction,{<|>}]> + List of reactions and directions to test for addition in the model (should already be in model) + + Returns + ------- + list<[obj reaction,{<|>}]> + List of reactions and directions filtered because they fail tests when in the model + + Raises + ------ + """ + # First run the full test + if self.test_single_condition(condition, apply_condition=False, model=currmodel): + return [] + # First knockout all reactions in the input list and save original bounds + filtered_list = [] + original_bound = [] + for item in reaction_list: + if item[1] == ">": + original_bound.append(item[0].upper_bound) + item[0].upper_bound = 0 + else: + original_bound.append(item[0].lower_bound) + item[0].lower_bound = 0 + # Now restore reactions one at a time + count = 0 + for item in reaction_list: + if item[1] == ">": + 
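#Restore this reaction's original bound and re-test; if the condition now fails, knock it back out and record it as filtered +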
item[0].upper_bound = original_bound[count] + if not self.test_single_condition(condition, apply_condition=False, model=currmodel): + # logger.debug(item[0].id+":"+item[1]) + item[0].upper_bound = 0 + if item not in filtered_list: + item.append(original_bound[count]) + item.append(self.score) + filtered_list.append(item) + else: + item[0].lower_bound = original_bound[count] + if not self.test_single_condition(condition, apply_condition=False, model=currmodel): + # logger.debug(item[0].id+":"+item[1]) + item[0].lower_bound = 0 + if item not in filtered_list: + item.append(original_bound[count]) + item.append(self.score) + filtered_list.append(item) + count += 1 + return filtered_list + + def binary_expansion_test(self, reaction_list, condition, currmodel, depth=0,positive_growth=[]): + """Conducts a binary search for bad reaction combinations Parameters ---------- reaction_list : list<[obj reaction,{>|>}]> @@ -523,54 +1580,429 @@ def reaction_expansion_test(self, reaction_list: list, condition_list: list): Raises ------ """ - tic = time.perf_counter() - - logger.info( - f"Expansion started! reaction list: {len(reaction_list)} conditions: {len(condition_list)}" + newdepth = depth + 1 + filtered_list = [] + # First run the full test + if self.test_single_condition(condition,apply_condition=False,model=currmodel,rxn_list=reaction_list): + #logger.debug("Reaction set passed"," ".join(map(str, reaction_list))) + return [] + # Check if input list contains only one reaction: + if len(reaction_list) == 1: + #logger.debug("Failed:"+reaction_list[0][1]+reaction_list[0][0].id) + if reaction_list[0][1] == ">": + reaction_list[0].append(reaction_list[0][0].upper_bound) + reaction_list[0][0].upper_bound = 0 + else: + reaction_list[0].append(reaction_list[0][0].lower_bound) + reaction_list[0][0].lower_bound = 0 + #Check if the reaction passes the positive growth test + success = True + if len(positive_growth) > 0: + #Testing positive growth conditions + for pos_condition in positive_growth: + if not self.test_single_condition(pos_condition,apply_condition=True,model=currmodel): + logger.debug("Does not pass positive growth tests:"+reaction_list[0][1]+reaction_list[0][0].id) + success = False + break + #Restoring current test condition + self.apply_test_condition(condition) + if success: + reaction_list[0].append(self.score) + filtered_list.append(reaction_list[0]) + else: + #Restoring reaction + if reaction_list[0][1] == ">": + reaction_list[0][0].upper_bound = reaction_list[0][2] + else: + reaction_list[0][0].lower_bound = reaction_list[0][2] + self.breaking_reaction = reaction_list[0][0] + return filtered_list + # Break reaction list into two + original_bound = [] + sub_lists = [[], []] + midway_point = int(len(reaction_list) / 2) + for i, item in enumerate(reaction_list): + if item[1] == ">": + original_bound.append(item[0].upper_bound) + else: + original_bound.append(item[0].lower_bound) + if i < midway_point: + sub_lists[0].append(item) + else: + sub_lists[1].append(item) + if item[1] == ">": + item[0].upper_bound = 0 + else: + item[0].lower_bound = 0 + # Submitting first half of reactions for testing + new_filter = self.binary_expansion_test( + sub_lists[0], condition, currmodel,depth=newdepth,positive_growth=positive_growth + ) + for item in new_filter: + filtered_list.append(item) + if self.breaking_reaction != None: + logger.debug("Ending early due to breaking reaction:"+self.breaking_reaction.id) + return filtered_list + # Submitting second half of reactions for testing - now only breaking 
reactions are removed from the first list + for i, item in enumerate(reaction_list): + if i >= midway_point: + if item[1] == ">": + item[0].upper_bound = original_bound[i] + else: + item[0].lower_bound = original_bound[i] + new_filter = self.binary_expansion_test( + sub_lists[1], condition, currmodel,depth=newdepth,positive_growth=positive_growth ) + for item in new_filter: + filtered_list.append(item) + return filtered_list + def check_if_solution_exists(self, reaction_list, condition, model): + original_bound = [] + for i, item in enumerate(reaction_list): + if item[1] == ">": + original_bound.append(item[0].upper_bound) + item[0].upper_bound = 0 + else: + original_bound.append(item[0].lower_bound) + item[0].lower_bound = 0 + result = self.test_single_condition(condition,model=model) + for i, item in enumerate(reaction_list): + if item[1] == ">": + item[0].upper_bound = original_bound[i] + else: + item[0].lower_bound = original_bound[i] + return result + + def reaction_expansion_test( + self, + reaction_list, + condition_list, + binary_search=True, + attribute_label="gf_filter", + positive_growth=[], + resort_by_score=True, + active_reaction_sets=[] + ): + """Adds reactions in reaction list one by one and applies tests, filtering reactions that fail + + Parameters + ---------- + reaction_list : list<[obj reaction,{<|>}]> + List of reactions and directions to test for addition in the model (should already be in model) + condition_list : list + Specifies set of conditions to be tested with media, objective, is_max_threshold, threshold. + + Returns + ------- + list<[obj reaction,{<|>}]> + List of reactions and directions filtered because they fail tests when in the model + + Raises + ------ + """ + logger.debug(f"Expansion started! Binary = {binary_search}") + self.breaking_reaction = None filtered_list = [] + if resort_by_score: + scores = self.assign_reliability_scores_to_reactions(active_reaction_sets=active_reaction_sets) + reaction_list = sorted(reaction_list, key=lambda x: scores[x[0].id][x[1]]) + for item in reaction_list: + logger.debug(item[0].id+":"+item[1]+":"+str(scores[item[0].id][item[1]])) + currmodel = self.model for condition in condition_list: - logger.debug(f"testing condition {condition}") - currmodel = self.model + tic = time.perf_counter() + new_filtered = [] + if not self.check_if_solution_exists(reaction_list, condition, currmodel): + logger.debug("No solution exists that passes tests for condition "+condition["media"].id) + return None with currmodel: self.apply_test_condition(condition) - # First knockout all reactions in the input list and save original bounds - original_bound = [] - for item in reaction_list: - if item[1] == ">": - original_bound.append(item[0].upper_bound) - item[0].upper_bound = 0 - else: - original_bound.append(item[0].lower_bound) - item[0].lower_bound = 0 - # Now restore reactions one at a time - count = 0 - for item in reaction_list: - if item[1] == ">": - item[0].upper_bound = original_bound[count] - if not self.test_single_condition(condition, False, currmodel): - item[0].upper_bound = 0 + if binary_search: + done = False + while not done: + new_filtered = self.binary_expansion_test( + reaction_list, condition, currmodel,positive_growth=positive_growth + ) + for item in new_filtered: if item not in filtered_list: - item.append(original_bound[count]) - item.append(self.score) filtered_list.append(item) + if self.breaking_reaction == None: + done = True + else: + #Remove breaking reaction from reaction_list + logger.debug("Keeping breaking 
reaction:"+self.breaking_reaction.id) + for i in range(len(reaction_list)): + if reaction_list[i][0] == self.breaking_reaction: + del reaction_list[i] + break + if not self.check_if_solution_exists(reaction_list, condition, currmodel): + logger.debug("No solution exists after retaining breaking reaction:"+self.breaking_reaction.id) + return None + self.breaking_reaction = None + else: + new_filtered = self.linear_expansion_test( + reaction_list, condition, currmodel,positive_growth=positive_growth + ) + for item in new_filtered: + if item not in filtered_list: + filtered_list.append(item) + # Restoring knockout of newly filtered reactions, which expire after exiting the "with" block above + for item in new_filtered: + if item[1] == ">": + item[0].upper_bound = 0 + else: + item[0].lower_bound = 0 + toc = time.perf_counter() + logger.info( + "Expansion time:" + condition["media"].id + ":" + str((toc - tic)) + ) + logger.info( + "Filtered count:" + + str(len(filtered_list)) + + " out of " + + str(len(reaction_list)) + ) + # Adding filter results to attributes + gf_filter_att = self.get_attributes(attribute_label, {}) + if condition["media"].id not in gf_filter_att: + gf_filter_att[condition["media"].id] = {} + if condition["objective"] not in gf_filter_att[condition["media"].id]: + gf_filter_att[condition["media"].id][condition["objective"]] = {} + if ( + condition["threshold"] + not in gf_filter_att[condition["media"].id][condition["objective"]] + ): + gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ] = {} + for item in new_filtered: + if ( + item[0].id + not in gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ] + ): + gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ][item[0].id] = {} + if ( + item[1] + not in gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ][item[0].id] + ): + if len(item) < 3: + gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ][item[0].id][item[1]] = None else: - item[0].lower_bound = original_bound[count] - if not self.test_single_condition(condition, False, currmodel): - item[0].lower_bound = 0 - if item not in filtered_list: - item.append(original_bound[count]) - item.append(self.score) - filtered_list.append(item) - count += 1 - toc = time.perf_counter() - print("Expansion time:", (toc - tic)) - print("Filtered count:", len(filtered_list), " out of ", len(reaction_list)) + gf_filter_att[condition["media"].id][condition["objective"]][ + condition["threshold"] + ][item[0].id][item[1]] = item[2] return filtered_list + ################################################################################# + # Functions for reaction set analysis + ################################################################################# + def analyze_minimal_reaction_set(self,solution,label,print_output=True): + """Systematically exploring alternative options for each reaction in an input minimal reaction set + + Parameters + ---------- + reaction_set : list + List of reactions to be evaluated for alternative options + print_output : bool + Prints output to stdout if true + + Returns + ------- + {obj reaction: list >} : list of reactions pointing to their alternative options + + Raises + ------ + """ + #Determining reaction set as the set of currently active reactions in the input solution + reaction_set = [] + output = {} + original_objective = self.model.objective + 
minimal_deviation_objective = self.model.problem.Objective(0, direction="min") + initial_zero_reactions = {} + obj_coef = dict() + scores = self.assign_reliability_scores_to_reactions() + for rxn in self.model.reactions: + if abs(solution.fluxes[rxn.id]) < 0.000000001: + initial_zero_reactions[rxn.id] = {">":True,"<":True} + obj_coef[rxn.forward_variable] = 1 + obj_coef[rxn.reverse_variable] = 1 + elif solution.fluxes[rxn.id] > 0.000000001 and rxn.lower_bound <= 0: + output[rxn.id] = [">",[]] + reaction_set.append([rxn,">",solution.fluxes[rxn.id],scores[rxn.id][">"],self.is_core(rxn)]) + initial_zero_reactions[rxn.id] = {"<":True} + obj_coef[rxn.reverse_variable] = 1 + elif solution.fluxes[rxn.id] < -0.000000001 and rxn.upper_bound >= 0: + output[rxn.id] = ["<",[]] + reaction_set.append([rxn,"<",solution.fluxes[rxn.id],scores[rxn.id]["<"],self.is_core(rxn)]) + initial_zero_reactions[rxn.id] = {">":True} + obj_coef[rxn.forward_variable] = 1 + self.model.objective = minimal_deviation_objective + minimal_deviation_objective.set_linear_coefficients(obj_coef) + #Knocking reactions out one at a time and checking for alternative options + for item in reaction_set: + original_bound = None + if item[1] == ">": + original_bound = item[0].upper_bound + item[0].upper_bound = 0 + else: + original_bound = item[0].lower_bound + item[0].lower_bound = 0 + new_solution = self.model.optimize() + result = {"alternatives":[],"coupled":[],"failed":False,"flux":item[2],"score":item[3],"core":item[4]} + output[item[0].id][1].append(result) + if new_solution.status == "optimal": + for secitem in reaction_set: + if secitem != item: + if abs(new_solution.fluxes[secitem[0].id]) < 0.000000001: + result["coupled"].append(secitem) + for rxn in self.model.reactions: + if rxn.id in initial_zero_reactions and abs(new_solution.fluxes[rxn.id]) > 0.000000001: + if new_solution.fluxes[rxn.id] > 0.000000001 and ">" in initial_zero_reactions[rxn.id]: + result["alternatives"].append([rxn,">"]) + elif new_solution.fluxes[rxn.id] < -0.000000001 and "<" in initial_zero_reactions[rxn.id]: + result["alternatives"].append([rxn,"<"]) + else: + result["failed"] = True + if original_bound != None: + if item[1] == ">": + item[0].upper_bound = original_bound + else: + item[0].lower_bound = original_bound + + self.model.objective = original_objective + #Printing output if requested + if print_output: + records = [] + for rxnid in output: + item = output[rxnid] + record = {"id":rxnid,"direction":item[0],"flux":item[1][0]["flux"],"score":item[1][0]["score"],"core":item[1][0]["core"],"equation":self.model.reactions.get_by_id(rxnid).build_reaction_string(use_metabolite_names=True),"coupled":"","alternatives":"","failed":item[1][0]["failed"]} + for subitem in item[1][0]["alternatives"]: + if len(record["alternatives"]): + record["alternatives"] += ";" + record["alternatives"] += subitem[1]+subitem[0].id+":"+subitem[0].build_reaction_string(use_metabolite_names=True) + for subitem in item[1][0]["coupled"]: + if len(record["coupled"]): + record["coupled"] += ";" + record["coupled"] += subitem[1]+subitem[0].id+":"+subitem[0].build_reaction_string(use_metabolite_names=True) + records.append(record) + df = pd.DataFrame.from_records(records) + df.to_csv("nboutput/rxn_analysis/"+label+"-min_rxn_set_analysis.csv",index=False) + return output + + ################################################################################# + # Functions related to biomass sensitivity analysis + 
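The objective swap performed by `analyze_minimal_reaction_set` above minimizes total flux through reactions that carried no flux in the input solution, so alternative routes are activated only when strictly needed. A compact sketch of that objective construction, assuming a plain `cobra.Model` named `model` and a set `initially_off` of reaction ids (both illustrative):

```python
from cobra import Model

def build_minimal_deviation_objective(model: Model, initially_off: set):
    """Objective penalizing any flux through reactions in `initially_off`.

    Both the forward and reverse optlang variables get coefficient 1, so
    the objective value is the total absolute flux through those
    reactions; minimizing it keeps new solutions close to the original.
    """
    objective = model.problem.Objective(0, direction="min")
    coefficients = {}
    for rxn in model.reactions:
        if rxn.id in initially_off:
            coefficients[rxn.forward_variable] = 1
            coefficients[rxn.reverse_variable] = 1
    model.objective = objective  # attach before setting coefficients
    objective.set_linear_coefficients(coefficients)
    return objective
```

Attaching the objective to the model before calling `set_linear_coefficients` matters: the coefficients are written into the solver instance, which only exists once the objective is bound, which is why the code above follows the same order.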
################################################################################# + def find_unproducible_biomass_compounds(self, target_rxn="bio1", ko_list=None): + # Cloning the model because we don't want to modify the original model with this analysis + tempmodel = cobra.io.json.from_json(cobra.io.json.to_json(self.model)) + # Getting target reaction and making sure it exists + if target_rxn not in tempmodel.reactions: + logger.critical(target_rxn + " not in model!") + return None + target_rxn_obj = tempmodel.reactions.get_by_id(target_rxn) + tempmodel.objective = target_rxn + original_objective = tempmodel.objective + pkgmgr = MSPackageManager.get_pkg_mgr(tempmodel) + rxn_list = [target_rxn, "rxn05294_c0", "rxn05295_c0", "rxn05296_c0"] + for rxn in rxn_list: + if rxn in tempmodel.reactions: + pkgmgr.getpkg("FlexibleBiomassPkg").build_package( + { + "bio_rxn_id": rxn, + "flex_coefficient": [0, 1], + "use_rna_class": None, + "use_dna_class": None, + "use_protein_class": None, + "use_energy_class": [0, 1], + "add_total_biomass_constraint": False, + } + ) + + # Creating min flex objective + min_flex_obj = tempmodel.problem.Objective(Zero, direction="min") + obj_coef = dict() + for reaction in tempmodel.reactions: + if reaction.id[0:5] == "FLEX_" or reaction.id[0:6] == "energy": + obj_coef[reaction.forward_variable] = 1 + obj_coef[reaction.reverse_variable] = 1 + # Temporarily setting flex objective so I can set coefficients + tempmodel.objective = min_flex_obj + min_flex_obj.set_linear_coefficients(obj_coef) + if not ko_list: + return self.run_biomass_dependency_test( + target_rxn_obj, tempmodel, original_objective, min_flex_obj, rxn_list + ) + else: + output = {} + for item in ko_list: + logger.debug("KO:" + item[0] + item[1]) + if item[0] not in output: + output[item[0]] = {} + if item[0] in tempmodel.reactions: + rxnobj = tempmodel.reactions.get_by_id(item[0]) + if item[1] == ">": + original_bound = rxnobj.upper_bound + rxnobj.upper_bound = 0 + output[item[0]][item[1]] = self.run_biomass_dependency_test( + target_rxn_obj, + tempmodel, + original_objective, + min_flex_obj, + rxn_list, + ) + rxnobj.upper_bound = original_bound + else: + original_bound = rxnobj.lower_bound + rxnobj.lower_bound = 0 + output[item[0]][item[1]] = self.run_biomass_dependency_test( + target_rxn_obj, + tempmodel, + original_objective, + min_flex_obj, + rxn_list, + ) + rxnobj.lower_bound = original_bound + else: + logger.info("Reaction "+item[0]+" not in model during sensitivity analysis!") + output[item[0]][item[1]] = [] + return output + + def run_biomass_dependency_test( + self, target_rxn, tempmodel, original_objective, min_flex_obj, rxn_list + ): + tempmodel.objective = original_objective + objective = tempmodel.slim_optimize() + if objective > 0: + target_rxn.lower_bound = 0.1 + tempmodel.objective = min_flex_obj + solution = tempmodel.optimize() + biocpds = [] + for reaction in tempmodel.reactions: + if reaction.id[0:5] == "FLEX_" and ( + reaction.forward_variable.primal > Zero + or reaction.reverse_variable.primal > Zero + ): + logger.debug("Depends on:" + reaction.id) + label = reaction.id[5:] + for item in rxn_list: + if label[0 : len(item)] == item: + biocpds.append(label[len(item) + 1 :]) + target_rxn.lower_bound = 0 + return biocpds + else: + logger.debug("Cannot grow") + return None + def add_atp_hydrolysis(self, compartment): # Searching for ATP hydrolysis compounds coefs = { diff --git a/modelseedpy/core/msprobability.py b/modelseedpy/core/msprobability.py new file mode 100644 index 
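`find_unproducible_biomass_compounds` below works on a throwaway copy of the model (a JSON round-trip) so the flexible-biomass machinery never pollutes the original, and every knockout in `ko_list` follows the same save/zero/test/restore pattern used throughout this module. A generic sketch of that pattern; `analysis` is a hypothetical callback, not a modelseedpy function:

```python
import cobra

def run_with_knockout(model, rxn_id, direction, analysis):
    """Disable one direction of a reaction, run `analysis`, restore bounds.

    `direction` is ">" (forward) or "<" (reverse), matching the
    convention used throughout this module.
    """
    rxn = model.reactions.get_by_id(rxn_id)
    if direction == ">":
        saved, rxn.upper_bound = rxn.upper_bound, 0
    else:
        saved, rxn.lower_bound = rxn.lower_bound, 0
    try:
        return analysis(model)
    finally:  # always restore, even if the analysis raises
        if direction == ">":
            rxn.upper_bound = saved
        else:
            rxn.lower_bound = saved

# Disposable copy, as in find_unproducible_biomass_compounds below:
# tempmodel = cobra.io.json.from_json(cobra.io.json.to_json(model))
```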
00000000..a4f913a0 --- /dev/null +++ b/modelseedpy/core/msprobability.py @@ -0,0 +1,246 @@ +from cobrakbase.core.kbasefba.fbamodel_from_cobra import CobraModelConverter +from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.community.mscommunity import MSCommunity +from cobrakbase.core.kbasefba.fbamodel import FBAModel +from cobra.io import write_sbml_model, read_sbml_model +from optlang import Objective +from json import load, dump +from os import path, mkdir +from cobra import Model +import re + + +def add_biomass_objective(megaModel, captured_rxnIDs): + if "bio1" in captured_rxnIDs: + megaModel.objective = Objective( + megaModel.reactions.bio1.flux_expression, direction="max" + ) + else: + # select the most conserved biomass composition + for rxn in megaModel.reactions: + if "biomass" in rxn.id and "EX_" not in rxn.id: + megaModel.objective = Objective(rxn.flux_expression, direction="max") + break + megaModel.solver.update() + return megaModel + + +class MSProbability: + + # TODO - add the parallelization code with an argument flag + @staticmethod + def megaModel( + clades_paths, kbase_api=None, reaction_counts_path=None, numTotal="numMembers",copy_genes=True + ): + # compute the reaction frequency of the models in a given clade + broken_models, megaModels = [], [] + # models_paths = glob(f"{models_path}/*.xml") + for clade, paths in clades_paths.items(): + print(clade+"1") + if not reaction_counts_path: + print(clade+"2") + if not path.exists("reaction_counts"): + mkdir("reaction_counts") + reaction_counts = {} + for index, model_path in enumerate(paths): + print( + f"{model_path}\tindex {index}\t\t\t\t\t\t\t\t\t\t\t\t", end="\r" + ) + try: + model = ( + read_sbml_model(model_path) + if not kbase_api + else kbase_api.get_from_ws(model_path) + ) + except Exception as e: + print("broken", e, model_path) + broken_models.append(model_path) + continue + # print(f"\n{len(model.reactions)} reactions", ) + for rxn in model.reactions: + if rxn.id in reaction_counts: + reaction_counts[rxn.id] += 1 + else: + reaction_counts[rxn.id] = 1 + # TODO storing a list of the rxn objects will save computational effort in the subsequent step + reaction_counts.update({numTotal: len(paths) - len(broken_models)}) + reaction_counts.update( + { + rxnID: (count / reaction_counts[numTotal]) + for rxnID, count in reaction_counts.items() + if rxnID != numTotal + } + ) + with open(f"reaction_counts/{clade}_reactions.json", "w") as jsonOut: + dump(reaction_counts, jsonOut, indent=3) + else: + try: + with open(f"{reaction_counts_path}/{clade}.json", "r") as jsonIn: + reaction_counts = load(jsonIn) + except: + print(f"broken model: {clade}") + continue + + # constructing the probabilistic clade model + megaModel = FBAModel( + { + "id": clade, + "name": f"MegaModel for {clade} from {reaction_counts[numTotal]} members", + } + ) + # megaModel = CobraModelConverter(Model(clade, name=f"MegaModel for {clade} from {reaction_counts[numTotal]} members")).build() + remaining_rxnIDs = set(list(reaction_counts.keys())) + captured_reactions, captured_rxnIDs = [], set() + + print("\n", clade) # , end="\t") + found_rxn_hash = {} + for model_path in paths: + print(f"{model_path}\t\t\t\t\t\t\t\t\t\t\t\t", end="\r") + try: + model = ( + read_sbml_model(model_path) + if not kbase_api + else kbase_api.get_from_ws(model_path) + ) + except Exception as e: + print("broken", e, model_path) + broken_models.append(model_path) + continue + for rxn in model.reactions: + if rxn.id not in found_rxn_hash: + found_rxn_hash[rxn.id] =
{"genes":{},"rxn":rxn} + captured_reactions.append(rxn) + elif copy_genes: + for gene in rxn.genes: + if gene.id not in found_rxn_hash[rxn.id]: + found_rxn_hash[rxn.id]["genes"][gene.id] = 1 + if len(found_rxn_hash[rxn.id]["rxn"].gene_reaction_rule) > 0: + found_rxn_hash[rxn.id]["rxn"].gene_reaction_rule += f" or {gene.id}" + else: + found_rxn_hash[rxn.id]["rxn"].gene_reaction_rule = gene.id + if captured_reactions == []: + print(f"\tNo models for {clade} are defined.") + continue + ## add reactions + megaModel.add_reactions(list(captured_reactions)) + for rxn in megaModel.reactions: + rxn.notes["probability"] = reaction_counts[rxn.id] + ## add objective + megaModel = add_biomass_objective(megaModel, captured_rxnIDs) + ## evaluate the model and export + missingRxns = ( + set([rxnID for rxnID in reaction_counts]) + - set([rxn.id for rxn in megaModel.reactions]) + - {numTotal} + ) + if missingRxns != set(): + print("\nmissing reactions: ", missingRxns) + write_sbml_model(megaModel, clade+".xml") + megaModels.append(megaModel) + print("\tfinished") + return megaModels if len(clades_paths) > 1 else megaModels[0] + + @staticmethod + def apply_threshold(model, threshold=0.5): + for rxn in model.reactions: + if rxn.notes["probability"] < threshold: + rxn.lower_bound = rxn.upper_bound = 0 + return model + + # "MS2 - Probabilistic modeling" would create a probabilstic model and optionally an ensemble model from the probabilistic model + + # TODO - develop a separate App from + + # TODO - Construct another code to aggregate functions from all genomes into a single model, where the genes themselves would be mapped with a probability + ## only count genomes with SSOs + ## this would accelerate the construction of making a megaModel + ## specify an ANI cut-off and a closeness to the top-hitting genome + ## yield two models: augmented MAG model with only conserved functions and the probabilistic model with all functions + ## create the KBase module + GitHub repository, after Chris settles on a name + + # TODO - integrate the ensembleFBA modules + repositories + + # TODO - update the CommunityFBA update to run probabilistic models + + @staticmethod + def prFBA( + model_s_, + environment=None, + abundances=None, + min_prob=0.01, + prob_exp=1, + ex_weight=100, + commkinetics=None, + kinetics_coef=1000, + printLP=False, + expression=None + ): + from modelseedpy.community.commhelper import build_from_species_models + from modelseedpy.core.msmodelutl import MSModelUtil + from modelseedpy.fbapkg.elementuptakepkg import ElementUptakePkg + from optlang.symbolics import Zero + + # commkinetics = commkinetics if commkinetics is not None else len(model_s_) > 1 + mdlUtil = MSModelUtil( + model_s_ + if len(model_s_) == 1 + else build_from_species_models( + model_s_, abundances=abundances, commkinetics=commkinetics + ) + ) + if environment is not None: + mdlUtil.add_medium(environment) + # constrain carbon consumption and community composition + elepkg = ElementUptakePkg(mdlUtil.model) + elepkg.build_package({"C": 100}) + ## the total flux through the members proportional to their relative abundances + if not commkinetics and len(model_s_) > 1: + pkgmgr = MSPackageManager.get_pkg_mgr(mdlUtil.model) + MSCommObj = MSCommunity(mdlUtil.model, model_s_) + pkgmgr.getpkg("CommKineticPkg").build_package(kinetics_coef, MSCommObj) + + # constrain the model to 95% of the optimum growth + maxBioSol = mdlUtil.model.slim_optimize() + mdlUtil.add_minimal_objective_cons(maxBioSol * 0.95) + + # weight internal reactions based on their 
probabilities + ## minimize: sum_r^R ((1-probabilities^prob_exp_r)*flux_r + min_prob) + sum_ex^EX(ex_weight*EX) + coef = {} + for rxn in mdlUtil.model.reactions: + if "rxn" == rxn.id[0:3]: + coef.update( + { + rxn.forward_variable: max( + min_prob, (1 - float(rxn.notes["probability"]) ** prob_exp) + ) + } + ) + coef.update( + { + rxn.reverse_variable: max( + min_prob, (1 - float(rxn.notes["probability"]) ** prob_exp) + ) + } + ) + elif "EX_" == rxn.id[0:3]: + coef.update({rxn.forward_variable: ex_weight}) + coef.update({rxn.reverse_variable: ex_weight}) + mdlUtil.add_objective(Zero, "min", coef) + + print([cons.name for cons in mdlUtil.model.constraints]) + + if printLP: + with open("prFBA.lp", "w") as out: + out.write(str(mdlUtil.model.solver)) + + # simulate the probabilistic model with the respective probabilities + return mdlUtil.model.optimize() + + @staticmethod + def iterative_simulation(time_iterative_data): + pass + + def expressionData(data): + # iterate over the reactions, genes, and keep the highest expression score + # turn off reactions that are below a threshold, ensure that the growth is unchanged, otherwise restore the reaction. + pass diff --git a/modelseedpy/core/mstemplate.py b/modelseedpy/core/mstemplate.py index b1bb1975..20100a11 100644 --- a/modelseedpy/core/mstemplate.py +++ b/modelseedpy/core/mstemplate.py @@ -3,20 +3,28 @@ import copy import math from enum import Enum +import pandas as pd +import numpy as np from cobra.core import Metabolite, Reaction from cobra.core.dictlist import DictList from cobra.util import format_long_string +from modelseedpy.core.fbahelper import FBAHelper from modelseedpy.core.msmodel import ( get_direction_from_constraints, get_reaction_constraints_from_direction, get_cmp_token, ) from cobra.core.dictlist import DictList +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + +# from gevent.libev.corecext import self # from cobrakbase.kbase_object_info import KBaseObjectInfo logger = logging.getLogger(__name__) +SBO_ANNOTATION = "sbo" + class AttrDict(dict): """ @@ -35,6 +43,13 @@ class TemplateReactionType(Enum): GAPFILLING = "gapfilling" +class TemplateBiomassCoefficientType(Enum): + MOLFRACTION = "MOLFRACTION" + MOLSPLIT = "MOLSPLIT" + MULTIPLIER = "MULTIPLIER" + EXACT = "EXACT" + + class MSTemplateMetabolite: def __init__( self, @@ -129,7 +144,7 @@ class MSTemplateSpecies(Metabolite): def __init__( self, comp_cpd_id: str, - charge: int, + charge: float, compartment: str, cpd_id, max_uptake=0, @@ -146,20 +161,34 @@ def __init__( self.cpd_id ) - def to_metabolite(self, index="0"): + def to_metabolite(self, index="0", force=False): """ Create cobra.core.Metabolite instance :param index: compartment index + :@param force: force index :return: cobra.core.Metabolite """ if index is None: index = "" + index = str(index) + + if self.compartment == "e" and index.isnumeric(): + if force: + logger.warning( + f"Forcing numeric index [{index}] to extra cellular compartment not advised" + ) + else: + index = "0" + cpd_id = f"{self.id}{index}" compartment = f"{self.compartment}{index}" - name = f"{self.name}" - if len(str(index)) > 0: - name = f"{self.name} [{compartment}]" + if self.compound == None: + logger.critical( + f"Compound objective associated with [{cpd_id}] is missing from template" + ) + name = f"{self.compound.name} [{compartment}]" metabolite = Metabolite(cpd_id, self.formula, name, self.charge, compartment) + metabolite.notes["modelseed_template_id"] = self.id return metabolite @property @@ -169,8 +198,8 
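prFBA above first pins growth at no less than 95% of the optimum and then minimizes a probability-weighted flux sum, so low-confidence reactions carry flux only when nothing better exists. The per-variable weight reduces to one expression; a worked example of the `max(min_prob, 1 - p**prob_exp)` rule used above:

```python
def prfba_weight(probability: float, min_prob: float = 0.01,
                 prob_exp: float = 1) -> float:
    """Objective weight for one internal flux variable in prFBA.

    High-probability reactions get weights near zero (cheap to use);
    low-probability reactions approach weight one (expensive).
    """
    return max(min_prob, 1 - probability**prob_exp)

assert prfba_weight(1.0) == 0.01  # fully conserved: floored at min_prob
assert prfba_weight(0.5) == 0.5   # present in half the members
assert prfba_weight(0.0) == 1.0   # never observed: maximum penalty
```

Exchange reactions bypass this rule entirely and get the flat `ex_weight` penalty (100 by default), which strongly discourages solutions that route around the network through the medium.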
@@ def compound(self): @property def name(self): if self._template_compound: - return self._template_compound.name - return "" + return f"{self._template_compound.name} [{self.compartment}]" + return f"{self.id} [{self.compartment}]" @name.setter def name(self, value): @@ -279,15 +308,17 @@ def compartment(self): def to_reaction(self, model=None, index="0"): if index is None: index = "" + index = str(index) rxn_id = f"{self.id}{index}" compartment = f"{self.compartment}{index}" name = f"{self.name}" metabolites = {} for m, v in self.metabolites.items(): - if model and m.id in model.metabolites: - metabolites[model.metabolites.get_by_id(m.id)] = v + _metabolite = m.to_metabolite(index) + if _metabolite.id in model.metabolites: + metabolites[model.metabolites.get_by_id(_metabolite.id)] = v else: - metabolites[m.to_metabolite(index)] = v + metabolites[_metabolite] = v if len(str(index)) > 0: name = f"{self.name} [{compartment}]" @@ -295,6 +326,7 @@ def to_reaction(self, model=None, index="0"): rxn_id, name, self.subsystem, self.lower_bound, self.upper_bound ) reaction.add_metabolites(metabolites) + reaction.annotation["seed.reaction"] = self.reference_id return reaction @staticmethod @@ -411,7 +443,7 @@ def get_data(self): map(lambda x: "~/complexes/id/" + x.id, self.complexes) ), # 'status': self.status, - "type": self.type, + "type": self.type if type(self.type) is str else self.type.value, } # def build_reaction_string(self, use_metabolite_names=False, use_compartment_names=None): @@ -434,6 +466,453 @@ def get_data(self): # id=self.id, stoichiometry=self.build_reaction_string()) +class MSTemplateBiomassComponent: + def __init__( + self, + metabolite, + comp_class: str, + coefficient: float, + coefficient_type: str, + linked_metabolites, + ): + """ + :param metabolite:MSTemplateMetabolite + :param comp_class:string + :param coefficient:float + :param coefficient_type:string + :param linked_metabolites:{MSTemplateMetabolite:float} + """ + self.id = metabolite.id + "_" + comp_class + self.metabolite = metabolite + self.comp_class = comp_class + self.coefficient = coefficient + self.coefficient_type = coefficient_type + self.linked_metabolites = linked_metabolites + + @staticmethod + def from_dict(d, template): + met_id = d["templatecompcompound_ref"].split("/").pop() + metabolite = template.compcompounds.get_by_id(met_id) + linked_metabolites = {} + for count, item in enumerate(d["linked_compound_refs"]): + l_met_id = item.split("/").pop() + l_metabolite = template.compcompounds.get_by_id(l_met_id) + linked_metabolites[l_metabolite] = d["link_coefficients"][count] + self = MSTemplateBiomassComponent( + metabolite, + d["class"], + d["coefficient"], + d["coefficient_type"], + linked_metabolites, + ) + return self + + def get_data(self): + data = { + "templatecompcompound_ref": "~/compcompounds/id/" + self.metabolite.id, + "class": self.comp_class, + "coefficient": self.coefficient, + "coefficient_type": self.coefficient_type, + "linked_compound_refs": [], + "link_coefficients": [], + } + for met in self.linked_metabolites: + data["linked_compound_refs"].append("~/compcompounds/id/" + met.id) + data["link_coefficients"].append(self.linked_metabolites[met]) + return data + + +class MSTemplateBiomass: + def __init__( + self, + biomass_id: str, + name: str, + type: str, + dna: float = 0, + rna: float = 0, + protein: float = 0, + lipid: float = 0, + cellwall: float = 0, + cofactor: float = 0, + pigment: float = 0, + carbohydrate: float = 0, + energy: float = 0, + other: float = 0 + ): + """ + + 
:param biomass_id:string + :param name:string + :param type:string + :param dna:float + :param rna:float + :param protein:float + :param lipid:float + :param cellwall:float + :param cofactor:float + :param pigment:float + :param carbohydrate:float + :param energy:float + :param other:float + """ + self.id = biomass_id + self.name = name + self.type = type + self.dna = dna + self.rna = rna + self.protein = protein + self.lipid = lipid + self.cellwall = cellwall + self.cofactor = cofactor + self.pigment = pigment + self.carbohydrate = carbohydrate + self.energy = energy + self.other = other + self.templateBiomassComponents = DictList() + self._template = None + + @staticmethod + def from_table( + filename_or_df, + template, + bio_id, + name, + type, + dna, + rna, + protein, + lipid, + cellwall, + cofactor, + pigment, + carbohydrate, + energy, + other, + ): + self = MSTemplateBiomass( + bio_id, + name, + type, + dna, + rna, + protein, + lipid, + cellwall, + cofactor, + pigment, + carbohydrate, + energy, + other, + ) + if isinstance(filename_or_df, str): + filename_or_df = pd.read_table(filename_or_df) + for index, row in filename_or_df.iterrows(): + if "biomass_id" not in row: + row["biomass_id"] = "bio1" + if row["biomass_id"] == bio_id: + if "compartment" not in row: + row["compartment"] = "c" + metabolite = template.compcompounds.get_by_id( + f'{row["id"]}_{row["compartment"].lower()}' + ) + linked_mets = {} + if ( + isinstance(row["linked_compounds"], str) + and len(row["linked_compounds"]) > 0 + ): + array = row["linked_compounds"].split("|") + for item in array: + sub_array = item.split(":") + l_met = template.compcompounds.get_by_id( + f'{sub_array[0]}_{row["compartment"].lower()}' + ) + linked_mets[l_met] = float(sub_array[1]) + self.add_biomass_component( + metabolite, + row["class"].lower(), + float(row["coefficient"]), + row["coefficient_type"].upper(), + linked_mets, + ) + return self + + @staticmethod + def from_dict(d, template): + self = MSTemplateBiomass( + d["id"], + d["name"], + d["type"], + d.get("dna", 0), + d.get("rna", 0), + d.get("protein", 0), + d.get("lipid", 0), + d.get("cellwall", 0), + d.get("cofactor", 0), + d.get("pigment", 0), + d.get("carbohydrate", 0), + d.get("energy", 0), + d.get("other", 0) + ) + for item in d["templateBiomassComponents"]: + biocomp = MSTemplateBiomassComponent.from_dict(item, template) + self.templateBiomassComponents.add(biocomp) + self._template = template + return self + + def add_biomass_component( + self, metabolite, comp_class, coefficient, coefficient_type, linked_mets={} + ): + biocomp = MSTemplateBiomassComponent( + metabolite, comp_class, coefficient, coefficient_type, linked_mets + ) + self.templateBiomassComponents.add(biocomp) + + def get_or_create_metabolite(self, model, baseid, compartment=None, index=None): + fullid = baseid + if compartment: + fullid += "_" + compartment + tempid = fullid + if index: + fullid += index + if fullid in model.metabolites: + return model.metabolites.get_by_id(fullid) + if tempid in self._template.compcompounds: + met = self._template.compcompounds.get_by_id(tempid).to_metabolite(index) + model.add_metabolites([met]) + return met + logger.error( + "Could not find biomass metabolite [%s] in model or template!", + fullid, + ) + + def get_or_create_reaction(self, model, baseid, compartment=None, index=None): + logger.debug(f"{baseid}, {compartment}, {index}") + fullid = baseid + if compartment: + fullid += "_" + compartment + tempid = fullid + if index: + fullid += index + if fullid in 
model.reactions: + return model.reactions.get_by_id(fullid) + if tempid in self._template.reactions: + rxn = self._template.reactions.get_by_id(tempid).to_reaction(model, index) + model.add_reactions([rxn]) + return rxn + newrxn = Reaction(fullid, fullid, "biomasses", 0, 1000) + model.add_reactions([newrxn]) + return newrxn + + def build_biomass(self, model, index="0", classic=False, GC=0.5, add_to_model=True): + types = [ + "cofactor", + "pigment", + "carbohydrate", + "lipid", + "cellwall", + "protein", + "dna", + "rna", + "energy", + "other" + ] + type_abundances = { + "cofactor": self.cofactor, + "pigment": self.pigment, + "carbohydrate": self.carbohydrate, + "lipid": self.lipid, + "cellwall": self.cellwall, + "protein": self.protein, + "dna": self.dna, + "rna": self.rna, + "energy": self.energy, + } + # Creating biomass reaction object + metabolites = {} + biorxn = Reaction(self.id, self.name, "biomasses", 0, 1000) + # Adding standard compounds for DNA, RNA, protein, and biomass + specific_reactions = {"dna": None, "rna": None, "protein": None} + exclusions = {"cpd17041_c": 1, "cpd17042_c": 1, "cpd17043_c": 1} + if not classic and self.dna > 0: + met = self.get_or_create_metabolite(model, "cpd11461", "c", index) + specific_reactions["dna"] = self.get_or_create_reaction( + model, "rxn05294", "c", index + ) + specific_reactions["dna"].name = "DNA synthesis" + if "rxn13783_c" + index in model.reactions: + specific_reactions[ + "dna" + ].gene_reaction_rule = model.reactions.get_by_id( + "rxn13783_c" + index + ).gene_reaction_rule + specific_reactions["dna"].notes[ + "modelseed_complex" + ] = model.reactions.get_by_id("rxn13783_c" + index).notes[ + "modelseed_complex" + ] + model.remove_reactions( + [model.reactions.get_by_id("rxn13783_c" + index)] + ) + specific_reactions["dna"].subtract_metabolites( + specific_reactions["dna"].metabolites + ) + specific_reactions["dna"].add_metabolites({met: 1}) + metabolites[met] = -1 * self.dna + if not classic and self.protein > 0: + met = self.get_or_create_metabolite(model, "cpd11463", "c", index) + specific_reactions["protein"] = self.get_or_create_reaction( + model, "rxn05296", "c", index + ) + specific_reactions["protein"].name = "Protein synthesis" + if "rxn13782_c" + index in model.reactions: + specific_reactions[ + "protein" + ].gene_reaction_rule = model.reactions.get_by_id( + "rxn13782_c" + index + ).gene_reaction_rule + specific_reactions["protein"].notes[ + "modelseed_complex" + ] = model.reactions.get_by_id("rxn13782_c" + index).notes[ + "modelseed_complex" + ] + model.remove_reactions( + [model.reactions.get_by_id("rxn13782_c" + index)] + ) + specific_reactions["protein"].subtract_metabolites( + specific_reactions["protein"].metabolites + ) + specific_reactions["protein"].add_metabolites({met: 1}) + metabolites[met] = -1 * self.protein + if not classic and self.rna > 0: + met = self.get_or_create_metabolite(model, "cpd11462", "c", index) + specific_reactions["rna"] = self.get_or_create_reaction( + model, "rxn05295", "c", index + ) + specific_reactions["rna"].name = "mRNA synthesis" + if "rxn13784_c" + index in model.reactions: + specific_reactions[ + "rna" + ].gene_reaction_rule = model.reactions.get_by_id( + "rxn13784_c" + index + ).gene_reaction_rule + specific_reactions["rna"].notes[ + "modelseed_complex" + ] = model.reactions.get_by_id("rxn13784_c" + index).notes[ + "modelseed_complex" + ] + model.remove_reactions(
[model.reactions.get_by_id("rxn13784_c" + index)] + ) + specific_reactions["rna"].subtract_metabolites( + specific_reactions["rna"].metabolites + ) + specific_reactions["rna"].add_metabolites({met: 1}) + metabolites[met] = -1 * self.rna + bio_type_hash = {} + for type in types: + for comp in self.templateBiomassComponents: + if comp.metabolite.id in exclusions and not classic: + pass + elif type == comp.comp_class: + met = self.get_or_create_metabolite( + model, comp.metabolite.id, None, index + ) + if type not in bio_type_hash: + bio_type_hash[type] = {"items": [], "total_mw": 0} + if FBAHelper.metabolite_mw(met): + bio_type_hash[type]["total_mw"] += ( + -1 * FBAHelper.metabolite_mw(met) * comp.coefficient / 1000 + ) + bio_type_hash[type]["items"].append(comp) + for type in bio_type_hash: + for comp in bio_type_hash[type]["items"]: + coef = None + if ( + comp.coefficient_type == "MOLFRACTION" + or comp.coefficient_type == "MOLSPLIT" + ): + coef = ( + type_abundances[type] / bio_type_hash[type]["total_mw"] + ) * comp.coefficient + elif comp.coefficient_type == "MULTIPLIER": + coef = type_abundances[type] * comp.coefficient + elif comp.coefficient_type == "EXACT": + coef = comp.coefficient + elif comp.coefficient_type == "AT": + coef = ( + 2 + * comp.coefficient + * (1 - GC) + * (type_abundances[type] / bio_type_hash[type]["total_mw"]) + ) + elif comp.coefficient_type == "GC": + coef = ( + 2 + * comp.coefficient + * GC + * (type_abundances[type] / bio_type_hash[type]["total_mw"]) + ) + if coef: + met = model.metabolites.get_by_id(comp.metabolite.id + index) + if type not in ("dna", "protein", "rna") or classic: + if met in metabolites: + metabolites[met] += coef + else: + metabolites[met] = coef + elif not classic: + coef = coef / type_abundances[type] + specific_reactions[type].add_metabolites({met: coef}) + for l_met in comp.linked_metabolites: + met = self.get_or_create_metabolite( + model, l_met.id, None, index + ) + if type not in ("dna", "protein", "rna") or classic: + if met in metabolites: + metabolites[met] += ( + coef * comp.linked_metabolites[l_met] + ) + else: + metabolites[met] = coef * comp.linked_metabolites[l_met] + elif not classic: + specific_reactions[type].add_metabolites( + {met: coef * comp.linked_metabolites[l_met]} + ) + biorxn.annotation[SBO_ANNOTATION] = "SBO:0000629" + biorxn.add_metabolites(metabolites) + if add_to_model: + if biorxn.id in model.reactions: + model.remove_reactions([biorxn.id]) + model.add_reactions([biorxn]) + return biorxn + + def get_data(self): + data = { + "id": self.id, + "name": self.name, + "type": self.type, + "dna": self.dna, + "rna": self.rna, + "protein": self.protein, + "lipid": self.lipid, + "cellwall": self.cellwall, + "cofactor": self.cofactor, + "pigment": self.pigment, + "carbohydrate": self.carbohydrate, + "energy": self.energy, + "other": self.other, + "templateBiomassComponents": [], + } + for comp in self.templateBiomassComponents: + data["templateBiomassComponents"].append(comp.get_data()) + + return data + + class NewModelTemplateRole: def __init__(self, role_id, name, features=None, source="", aliases=None): """ @@ -655,6 +1134,68 @@ def __init__( self.complexes = DictList() self.pathways = DictList() self.subsystems = DictList() + self.drains = None + + ################# Replaces biomass reactions from an input TSV table ############################ + def overwrite_biomass_from_table( + self, + filename_or_df, + bio_id, + name, + type, + dna, + rna, + protein, + lipid, + cellwall, + cofactor, + pigment, + carbohydrate, + 
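The coefficient arithmetic in `build_biomass` above is easiest to see in isolation. Below, each `coefficient_type` is reduced to its formula: `abundance` is the class abundance (e.g. `self.dna`), `total_mw` is the summed molecular-weight term accumulated in `bio_type_hash`, and `GC` is the genome GC fraction. This mirrors the branches above but is a sketch, not the shipped code.

```python
def biomass_coefficient(coef_type, coef, abundance, total_mw, GC=0.5):
    """Stoichiometric coefficient for one biomass component."""
    if coef_type in ("MOLFRACTION", "MOLSPLIT"):
        # mole fractions are normalized by the class's total weight
        return (abundance / total_mw) * coef
    if coef_type == "MULTIPLIER":
        return abundance * coef
    if coef_type == "EXACT":
        return coef
    if coef_type == "AT":  # AT base pairs scale with (1 - GC)
        return 2 * coef * (1 - GC) * (abundance / total_mw)
    if coef_type == "GC":  # GC base pairs scale with GC
        return 2 * coef * GC * (abundance / total_mw)
    return None  # unknown type: component is skipped, as above
```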
energy, + other, + ): + if isinstance(filename_or_df, str): + filename_or_df = pd.read_table(filename_or_df) + newbio = MSTemplateBiomass.from_table( + filename_or_df, + self, + bio_id, + name, + type, + dna, + rna, + protein, + lipid, + cellwall, + cofactor, + pigment, + carbohydrate, + energy, + other, + ) + if newbio.id in self.biomasses: + self.biomasses.remove(newbio.id) + self.biomasses.add(newbio) + + def add_drain(self, compound_id, lower_bound, upper_bound): + if compound_id not in self.compcompounds: + raise ValueError(f"{compound_id} not in template") + if lower_bound > upper_bound: + raise ValueError( + f"lower_bound: {lower_bound} must not be > than upper_bound: {upper_bound}" + ) + if self.drains is None: + self.drains = {} + self.drains[self.compcompounds.get_by_id(compound_id)] = ( + lower_bound, + upper_bound, + ) + + def add_sink(self, compound_id, default_upper_bound=1000): + self.add_drain(compound_id, 0, default_upper_bound) + + def add_demand(self, compound_id, default_lower_bound=-1000): + self.add_drain(compound_id, default_lower_bound, 0) def add_compartments(self, compartments: list): """ @@ -761,6 +1302,24 @@ def add_comp_compounds(self, comp_compounds: list): x._template_compound.species.add(x) self.compcompounds += comp_compounds + def add_biomasses(self, biomasses: list): + """ + Add biomasses to the template + :param biomasses: + :return: + """ + duplicates = list(filter(lambda x: x.id in self.biomasses, biomasses)) + if len(duplicates) > 0: + logger.error( + "unable to add biomasses [%s] already present in the template", + duplicates, + ) + return None + + for x in biomasses: + x._template = self + self.biomasses += biomasses + def add_reactions(self, reaction_list: list): """ @@ -789,7 +1348,9 @@ def add_reactions(self, reaction_list: list): if cpx.id not in self.complexes: self.add_complexes([cpx]) complex_replace.add(self.complexes.get_by_id(cpx.id)) + x._metabolites = metabolites_replace + x._update_awareness() x.complexes = complex_replace self.reactions += reaction_list @@ -858,7 +1419,7 @@ def get_data(self): } NewModelTemplate; """ - return { + d = { "__VERSION__": self.__VERSION__, "id": self.id, "name": self.name, @@ -871,11 +1432,16 @@ def get_data(self): "roles": list(map(lambda x: x.get_data(), self.roles)), "complexes": list(map(lambda x: x.get_data(), self.complexes)), "reactions": list(map(lambda x: x.get_data(), self.reactions)), - "biomasses": list(self.biomasses), + "biomasses": list(map(lambda x: x.get_data(), self.biomasses)), "pathways": [], "subsystems": [], } + if self.drains is not None: + d["drain_list"] = {c.id: t for c, t in self.drains.items()} + + return d + def _repr_html_(self): """ taken from cobra.core.Model :) @@ -918,7 +1484,73 @@ def _repr_html_(self): num_roles=len(self.roles), num_complexes=len(self.complexes), ) + + def remove_reactions( + self, + reactions: Union[str, Reaction, List[Union[str, Reaction]]], + remove_orphans: bool = False, + ) -> None: + """Remove reactions from the template. + + The change is reverted upon exit when using the model as a context. + + Parameters + ---------- + reactions : list or reaction or str + A list with reactions (`cobra.Reaction`), or their id's, to remove. + Reaction will be placed in a list. Str will be placed in a list and used to + find the reaction in the model. + remove_orphans : bool, optional + Remove orphaned genes and metabolites from the model as + well (default False). 
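The drain helpers defined above (`add_drain`, `add_sink`, `add_demand`) are thin wrappers around a single bounds dictionary: a sink permits only production of the compound (bounds 0..ub) and a demand only consumption (lb..0). A usage sketch against a hypothetical template instance with a compound `cpd00001_c`:

```python
# `template` is assumed to be a template object exposing add_drain/add_sink/
# add_demand as defined above; cpd00001_c is an illustrative compound id.
template.add_sink("cpd00001_c")            # drains[cpd] = (0, 1000)
template.add_demand("cpd00001_c")          # drains[cpd] = (-1000, 0)
template.add_drain("cpd00001_c", -10, 10)  # custom bounds

# Both validations raise ValueError:
# template.add_drain("not_a_compound", 0, 1000)  # compound not in template
# template.add_drain("cpd00001_c", 5, -5)        # lower bound > upper bound
```

Because `drains` is keyed by the compound object, successive calls for the same compound overwrite each other; the last call wins.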
+ """ + if isinstance(reactions, str) or hasattr(reactions, "id"): + warn("need to pass in a list") + reactions = [reactions] + for reaction in reactions: + # Make sure the reaction is in the model + try: + reaction = self.reactions[self.reactions.index(reaction)] + except ValueError: + warn(f"{reaction} not in {self}") + + else: + self.reactions.remove(reaction) + + """ for met in reaction._metabolites: + if reaction in met._reaction: + met._reaction.remove(reaction) + if context: + context(partial(met._reaction.add, reaction)) + if remove_orphans and len(met._reaction) == 0: + self.remove_metabolites(met) + + for gene in reaction._genes: + if reaction in gene._reaction: + gene._reaction.remove(reaction) + if context: + context(partial(gene._reaction.add, reaction)) + + if remove_orphans and len(gene._reaction) == 0: + self.genes.remove(gene) + if context: + context(partial(self.genes.add, gene)) + + # remove reference to the reaction in all groups + associated_groups = self.get_associated_groups(reaction) + for group in associated_groups: + group.remove_members(reaction) """ + + #*************************Curation Functions************************* + def auto_fix_protons(self): + for rxn in self.reactions: + mb = rxn.check_mass_balance() + if 'charge' in mb and mb.get('H') == mb.get('charge'): + print(f'auto fix charge for {rxn.id}') + rxn.add_metabolites({ + self.compcompounds.cpd00067_c: -1 * mb['charge'] + }) class MSTemplateBuilder: def __init__( @@ -948,6 +1580,7 @@ def __init__( self.reactions = [] self.info = info self.biochemistry_ref = None + self.drains = {} @staticmethod def from_dict(d, info=None, args=None): @@ -969,6 +1602,7 @@ def from_dict(d, info=None, args=None): builder.reactions = d["reactions"] builder.biochemistry_ref = d["biochemistry_ref"] builder.biomasses = d["biomasses"] + return builder @staticmethod @@ -1074,7 +1708,12 @@ def build(self): ) ) template.biomasses += list( - map(lambda x: AttrDict(x), self.biomasses) - ) # TODO: biomass object + list( + map(lambda x: MSTemplateBiomass.from_dict(x, template), self.biomasses) + ) + ) + + for compound_id, (lb, ub) in self.drains.items(): + template.add_drain(compound_id, lb, ub) return template diff --git a/modelseedpy/core/optlanghelper.py b/modelseedpy/core/optlanghelper.py new file mode 100644 index 00000000..b616ab90 --- /dev/null +++ b/modelseedpy/core/optlanghelper.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Aug 18 10:26:32 2022 + +@author: Andrew Freiburger +""" +from collections import namedtuple +from optlang import Model +from typing import Iterable, Union +from pprint import pprint +import logging + +logger = logging.getLogger(__name__) + +Bounds = namedtuple("Bounds", ("lb", "ub"), defaults=(0, 1000)) +tupVariable = namedtuple( + "tupVariable", + ("name", "bounds", "type"), + defaults=("varName", Bounds(), "continuous"), +) +tupConstraint = namedtuple( + "tupConstraint", + ("name", "bounds", "expr"), + defaults=("consName", Bounds(0, 0), None), +) +tupObjective = namedtuple( + "tupObjective", + ("name", "expr", "direction"), + defaults=("objectiveName", None, "max"), +) + + +def isIterable(term): + try: + iter(term) + if type(term) is not str: + return True + return False + except: + return False + + +def isnumber(obj): + try: + float(obj) + return True + except: + return False + + +def define_term(value): + if isnumber(value): + return {"type": "Number", "value": value} + if isinstance(value, str): + return {"type": "Symbol", "name": value} + print(f"ERROR: The {value} of type 
{type(value)} is not known.") + + +def get_expression_template(expr): + # print(expr) + if isinstance(expr, list): + return {"type": "Add", "args": []} + return {"type": expr["operation"], "args": []} + + +class OptlangHelper: + + @staticmethod + def add_variables( + var_name: str, var_bounds: (list, tuple), var_type: str = "continuous" + ): + return { + "name": var_name.replace(" ", "_"), + "lb": var_bounds[0], + "ub": var_bounds[1], + "type": var_type, + } + + @staticmethod + def add_constraint(cons_name: str, cons_bounds: (list, tuple), cons_expr: dict): + return { + "name": cons_name.replace(" ", "_"), + "expression": OptlangHelper._define_expression(cons_expr), + "lb": cons_bounds[0], + "ub": cons_bounds[1], + "indicator_variable": None, + "active_when": 1, + } + + @staticmethod + def add_objective(obj_name: str, objective_expr: Union[dict, list], direction: str): + if isinstance(objective_expr, list): + obj_expr = { + "type": "Add", + "args": [ + OptlangHelper._define_expression(expr) for expr in objective_expr + ], + } + elif isinstance(objective_expr, dict): + obj_expr = { + "type": objective_expr["operation"], + "args": [define_term(term) for term in objective_expr["elements"]], + } + return { + "name": obj_name.replace(" ", "_"), + "expression": obj_expr, + "direction": direction, + } + + @staticmethod + def define_model(model_name, variables, constraints, objective, optlang=False): + model = {"name": model_name, "variables": [], "constraints": []} + # pprint(objective) + for var in variables: + if len(var) == 2: + var.append("continuous") + model["variables"].append( + OptlangHelper.add_variables(var[0], var[1], var[2]) + ) + for cons in constraints: + model["constraints"].append( + OptlangHelper.add_constraint(cons[0], cons[1], cons[2]) + ) + # if not isinstance(obj, str): # catches a strange error of the objective name as the objective itself + model["objective"] = OptlangHelper.add_objective( + objective[0], objective[1], objective[2] + ) + if optlang: + return Model.from_json(model) + return model + + @staticmethod + def _define_expression(expr: dict): + expression = get_expression_template(expr) + level1_coef = 0 + for ele in expr["elements"]: + if not isnumber(ele) and not isinstance(ele, str): + # print(expr, ele, end="\r") + arguments = [] + level2_coef = 0 + for ele2 in ele["elements"]: + if not isnumber(ele2) and not isinstance(ele2, str): + # print("recursive ele\t\t", type(ele2), ele2) + arguments.append(OptlangHelper._define_expression(ele2)) + elif isinstance(ele2, str): + arguments.append(define_term(ele2)) + else: + level2_coef += float(ele2) + expression["args"].append(get_expression_template(ele)) + if level2_coef != 0: + arguments.append(define_term(level2_coef)) + expression["args"][-1]["args"] = arguments + elif isinstance(ele, str): + expression["args"].append(define_term(ele)) + else: + level1_coef += float(ele) + if level1_coef != 0: + expression["args"].append(define_term(level1_coef)) + # pprint(expression) + return expression + + @staticmethod + def dot_product(zipped_to_sum, heuns_coefs=None): + # ensure that the lengths are compatible for heun's dot-products + if heuns_coefs is not None: + coefs = ( + heuns_coefs + if isinstance(heuns_coefs, (list, set)) + else heuns_coefs.tolist() + ) + zipped_length = len(zipped_to_sum) + coefs_length = len(coefs) + if zipped_length != coefs_length: + raise IndexError( + f"ERROR: The length of zipped elements {zipped_length}" + f" is unequal to that of coefficients {coefs_length}" + ) + + elements = [] + for index, 
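The OptlangHelper machinery above builds optlang's JSON expression schema by hand instead of constructing sympy expressions. As an illustration of the target structure, here is how `2*x + 3*y` comes out; field names follow `define_term` and `get_expression_template` above, while the constraint wrapper shown is an assumption about typical use of `add_constraint`:

```python
# Nested-dict encoding of 2*x + 3*y in optlang's JSON expression schema:
expression = {
    "type": "Add",
    "args": [
        {"type": "Mul", "args": [{"type": "Number", "value": 2},
                                 {"type": "Symbol", "name": "x"}]},
        {"type": "Mul", "args": [{"type": "Number", "value": 3},
                                 {"type": "Symbol", "name": "y"}]},
    ],
}

# add_constraint wraps this with bounds, giving the JSON form of
# 0 <= 2x + 3y <= 10, which optlang's Model.from_json can consume:
constraint = {"name": "c1", "expression": expression, "lb": 0, "ub": 10,
              "indicator_variable": None, "active_when": 1}
```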
(term1, term2) in enumerate(zipped_to_sum): + if heuns_coefs is not None: + elements.extend( + [ + {"operation": "Mul", "elements": [heuns_coefs[index], term1]}, + {"operation": "Mul", "elements": [heuns_coefs[index], term2]}, + ] + ) + else: + elements.append({"operation": "Mul", "elements": [term1, term2]}) + return elements diff --git a/modelseedpy/core/rast_client.py b/modelseedpy/core/rast_client.py index 575cf0d4..cc36b7bb 100644 --- a/modelseedpy/core/rast_client.py +++ b/modelseedpy/core/rast_client.py @@ -52,17 +52,15 @@ def __init__(self): ) self.stages = [ {"name": "annotate_proteins_kmer_v2", "kmer_v2_parameters": {}}, - { - "name": "annotate_proteins_kmer_v1", - "kmer_v1_parameters": {"annotate_hypothetical_only": 1}, - }, + # {"name": "annotate_proteins_kmer_v1", + # "kmer_v1_parameters": {"annotate_hypothetical_only": 1},}, { "name": "annotate_proteins_similarity", "similarity_parameters": {"annotate_hypothetical_only": 1}, }, ] - def annotate_genome(self, genome): + def annotate_genome(self, genome, split_terms=True): p_features = [] for f in genome.features: if f.seq and len(f.seq) > 0: @@ -72,9 +70,13 @@ def annotate_genome(self, genome): for o in res[0]["features"]: feature = genome.features.get_by_id(o["id"]) if "function" in o: - functions = re.split("; | / | @ | => ", o["function"]) - for function in functions: - feature.add_ontology_term("RAST", function) + rast_function = o["function"] + if split_terms: + functions = re.split("; | / | @", rast_function) + for function in functions: + feature.add_ontology_term("RAST", function) + else: + feature.add_ontology_term("RAST", rast_function) return res[0]["analysis_events"] @@ -84,6 +86,14 @@ def annotate_genome_from_fasta(self, filepath, split="|"): return genome, res + def annotate_protein_sequence(self, protein_id: str, protein_seq: str): + p_features = [{"id": protein_id, "protein_translation": protein_seq}] + return self.f(p_features) + + def annotate_protein_sequences(self, protein_seqs: dict): + p_features = [ + {"id": protein_id, "protein_translation": protein_seq} + for protein_id, protein_seq in protein_seqs.items() + ] + return self.f(p_features) + def f1(self, protein_id, protein_seq): p_features = [{"id": protein_id, "protein_translation": protein_seq}] return self.f(p_features) diff --git a/modelseedpy/core/rpcclient.py b/modelseedpy/core/rpcclient.py old mode 100644 new mode 100755 diff --git a/modelseedpy/data/FBAReportTemplate.html b/modelseedpy/data/FBAReportTemplate.html new file mode 100644 index 00000000..2ccad425 --- /dev/null +++ b/modelseedpy/data/FBAReportTemplate.html @@ -0,0 +1,213 @@ + [HTML report template; markup lost in extraction — only the page title "Community FBA" is recoverable]
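The `split_terms` change in rast_client.py above narrows the split pattern from `"; | / | @ | => "` to `"; | / | @"`, so multi-part RAST annotations are still separated while `=>` clauses are kept intact. A quick illustration of what the regex does to a typical multi-functional annotation string (the function names are illustrative):

```python
import re

func = ("Phosphoglycerate mutase (EC 5.4.2.11); "
        "Fructose-bisphosphate aldolase / Tagatose-bisphosphate aldolase")
terms = re.split("; | / | @", func)
# terms == ['Phosphoglycerate mutase (EC 5.4.2.11)',
#           'Fructose-bisphosphate aldolase',
#           'Tagatose-bisphosphate aldolase']
```

Each resulting term is then registered as a separate "RAST" ontology term on the feature, whereas `split_terms=False` stores the raw function string once.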
diff --git a/modelseedpy/data/ModelReportTemplate.html b/modelseedpy/data/ModelReportTemplate.html new file mode 100644 index 00000000..cab60a0b --- /dev/null +++ b/modelseedpy/data/ModelReportTemplate.html @@ -0,0 +1,349 @@ + [HTML report template; markup lost in extraction — only the page title "ModelSEED Reconstruction" is recoverable]
+ + + + + diff --git a/modelseedpy/data/atp_medias.tsv b/modelseedpy/data/atp_medias.tsv new file mode 100644 index 00000000..53d15048 --- /dev/null +++ b/modelseedpy/data/atp_medias.tsv @@ -0,0 +1,34 @@ +seed Glc.O2 Ac.O2 Etho.O2 Pyr.O2 Glyc.O2 Fum.O2 Succ.O2 Akg.O2 LLac.O2 Dlac.O2 For.O2 Glc Ac Etho Pyr Glyc Fum Succ Akg Llac Dlac For mal-L For.NO2 For.NO3 For.NO Pyr.NO2 Pyr.NO3 Pyr.NO Ac.NO2 Ac.NO3 Ac.NO Glc.DMSO Glc.TMAO Pyr.DMSO Pyr.TMAO Pyr.SO4 Pyr.SO3 H2.CO2 H2.Ac For.SO4.H2 LLac.SO4.H2 For.SO4 LLac.SO4 H2.SO4 empty Light ANME Methane Methanol Methanol.H2 Methanamine.H2 Dimethylamine.H2 Trimethylamine.H2 +EX_cpd00027_e0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00024_e0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00106_e0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00036_e0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00137_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00130_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00159_e0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 +EX_cpd00221_e0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00020_e0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00100_e0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00363_e0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00029_e0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00047_e0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00204_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00011_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00007_e0 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd11640_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 1000 1000 1000 0 0 1000 0 0 0 0 0 1000 1000 1000 1000 +EX_cpd00418_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 1000 0 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00209_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 1000 0 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00075_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 1000 0 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00659_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00528_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd08021_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 +EX_cpd00811_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd00048_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 0 1000 1000 1000 1000 1000 0 0 0 0 0 0 0 0 0 +EX_cpd00081_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +EX_cpd11632_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 +EX_cpd08701_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 0 0 0 0 0 0 +EX_cpd01024_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 +EX_cpd00116_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 +EX_cpd00187_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 +EX_cpd00425_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 +EX_cpd00441_e0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 diff --git a/modelseedpy/fbapkg/__init__.py b/modelseedpy/fbapkg/__init__.py index 6f67c85f..4056490f 100644 --- a/modelseedpy/fbapkg/__init__.py +++ b/modelseedpy/fbapkg/__init__.py @@ -17,5 +17,8 @@ from modelseedpy.fbapkg.problemreplicationpkg import ProblemReplicationPkg from modelseedpy.fbapkg.fullthermopkg import FullThermoPkg from modelseedpy.fbapkg.objconstpkg import ObjConstPkg +from modelseedpy.fbapkg.objectivepkg import ObjectivePkg from modelseedpy.fbapkg.changeoptpkg import ChangeOptPkg from modelseedpy.fbapkg.elementuptakepkg import ElementUptakePkg +from modelseedpy.fbapkg.expressionactivationpkg import ExpressionActivationPkg +from modelseedpy.fbapkg.reactionactivationpkg import ReactionActivationPkg diff --git a/modelseedpy/fbapkg/basefbapkg.py b/modelseedpy/fbapkg/basefbapkg.py index 4d0c613c..f97b1aed 100644 --- a/modelseedpy/fbapkg/basefbapkg.py +++ b/modelseedpy/fbapkg/basefbapkg.py @@ -3,13 +3,20 @@ from __future__ import absolute_import import logging -import re -from optlang.symbolics import Zero, add -import json as _json -from cobra.core import Gene, Metabolite, Model, Reaction +import re # !!! import is never used +from optlang.symbolics import Zero, add # !!! add is never used +import json as _json # !!! import is never used +from cobra.core import ( + Gene, + Metabolite, + Model, + Reaction, +) # !!! 
none of these imports are used from modelseedpy.fbapkg.mspackagemanager import MSPackageManager from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.core.exceptions import FeasibilityError +logger = logging.getLogger(__name__) # Adding a few exception classes to handle different types of errors class FeasibilityError(Exception): @@ -26,17 +33,26 @@ class BaseFBAPkg: def __init__( self, model, name, variable_types={}, constraint_types={}, reaction_types={} ): - self.model = model - self.modelutl = MSModelUtil(model) + if isinstance(model, MSModelUtil): + self.model = model.model + self.modelutl = model + else: + self.model = model + self.modelutl = MSModelUtil.get(model) + self.name = name + self.pkgmgr = MSPackageManager.get_pkg_mgr(model) if self.pkgmgr is None: self.pkgmgr = MSPackageManager.get_pkg_mgr(model, 1) self.pkgmgr.addpkgobj(self) - self.constraints = dict() - self.variables = dict() - self.parameters = dict() - self.new_reactions = dict() + + self.constraints, self.variables, self.parameters, self.new_reactions = ( + {}, + {}, + {}, + {}, + ) self.variable_types = variable_types self.constraint_types = constraint_types @@ -45,6 +61,18 @@ def __init__( for type in constraint_types: self.constraints[type] = dict() + for constraint in self.model.solver.constraints: + obj_type = constraint.name.split("_")[-1] + if obj_type in self.constraints: + name = "_".join(constraint.name.split("_")[0:-1]) + self.constraints[obj_type][name] = constraint + + for variable in self.model.solver.variables: + obj_type = variable.name.split("_")[-1] + if obj_type in self.variables: + name = "_".join(variable.name.split("_")[0:-1]) + self.variables[obj_type][name] = variable + def validate_parameters(self, params, required, defaults): for item in required: if item not in params: @@ -53,53 +81,72 @@ def validate_parameters(self, params, required, defaults): self.parameters.update(params) # replace defaults with params def clear(self): - objects = [] - for type in self.variables: - for object in self.variables[type]: - objects.append(self.variables[type][object]) - for type in self.constraints: - for object in self.constraints[type]: - objects.append(self.constraints[type][object]) - self.model.remove_cons_vars(objects) - self.variables = {} - self.constraints = {} - - def build_variable(self, type, lower_bound, upper_bound, vartype, object=None): + cobra_objs = [] + for obj_type in self.variables: + for cobra_obj in self.variables[obj_type]: + cobra_objs.append(self.variables[obj_type][cobra_obj]) + self.variables[obj_type] = {} + for obj_type in self.constraints: + for cobra_obj in self.constraints[obj_type]: + cobra_objs.append(self.constraints[obj_type][cobra_obj]) + self.constraints[obj_type] = {} + self.model.remove_cons_vars(cobra_objs) + + def build_variable( + self, obj_type, lower_bound, upper_bound, vartype, cobra_obj=None + ): name = None - if self.variable_types[type] == "none": - count = len(self.variables[type]) + if self.variable_types[obj_type] == "none": + count = len(self.variables[obj_type]) name = str(count + 1) - elif self.variable_types[type] == "string": - name = object + elif self.variable_types[obj_type] == "string": + name = cobra_obj else: - name = object.id - if name not in self.variables[type]: - self.variables[type][name] = self.model.problem.Variable( - name + "_" + type, lb=lower_bound, ub=upper_bound, type=vartype + name = cobra_obj.id + if name not in self.variables[obj_type]: + self.variables[obj_type][name] = self.model.problem.Variable( + name + 
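The new BaseFBAPkg constructor logic above rediscovers previously created variables and constraints purely from their solver names, which follow a `{object_id}_{type}` convention with the package type as the last underscore-separated token. A sketch of that round-trip; the type name `flxcmp` is borrowed from the packages below, everything else is illustrative:

```python
def parse_pkg_name(solver_name: str, known_types: set):
    """Split e.g. 'rxn00001_c0_flxcmp' into ('rxn00001_c0', 'flxcmp')."""
    tokens = solver_name.split("_")
    obj_type = tokens[-1]
    if obj_type not in known_types:
        return None  # not one of this package's variables/constraints
    # everything before the final token is the original object id,
    # which may itself contain underscores
    return "_".join(tokens[:-1]), obj_type

assert parse_pkg_name("rxn00001_c0_flxcmp", {"flxcmp"}) == ("rxn00001_c0", "flxcmp")
assert parse_pkg_name("bio1_unrelated", {"flxcmp"}) is None
```

This is also why the naming convention matters: any other code that creates solver objects ending in a registered type suffix would be silently adopted by the package on reload.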
"_" + obj_type, lb=lower_bound, ub=upper_bound, type=vartype ) - self.model.add_cons_vars(self.variables[type][name]) - return self.variables[type][name] + self.model.add_cons_vars(self.variables[obj_type][name]) + return self.variables[obj_type][name] - def build_constraint(self, type, lower_bound, upper_bound, coef={}, object=None): + def build_constraint( + self, obj_type, lower_bound, upper_bound, coef={}, cobra_obj=None + ): name = None - if self.constraint_types[type] == "none": - count = len(self.constraints[type]) + if self.constraint_types[obj_type] == "none": + count = len(self.constraints[obj_type]) name = str(count + 1) - elif self.constraint_types[type] == "string": - name = object + elif self.constraint_types[obj_type] == "string": + name = cobra_obj else: - name = object.id - if name in self.constraints[type]: - self.model.remove_cons_vars(self.constraints[type][name]) - self.constraints[type][name] = self.model.problem.Constraint( - Zero, lb=lower_bound, ub=upper_bound, name=name + "_" + type + name = cobra_obj.id + if name in self.constraints[obj_type]: + self.model.remove_cons_vars(self.constraints[obj_type][name]) + self.constraints[obj_type][name] = self.model.problem.Constraint( + Zero, lb=lower_bound, ub=upper_bound, name=name + "_" + obj_type ) - self.model.add_cons_vars(self.constraints[type][name]) + self.model.add_cons_vars(self.constraints[obj_type][name]) self.model.solver.update() if len(coef) > 0: - self.constraints[type][name].set_linear_coefficients(coef) + self.constraints[obj_type][name].set_linear_coefficients(coef) self.model.solver.update() - return self.constraints[type][name] + return self.constraints[obj_type][name] + + # Utility functions + def print_lp(self, filename=None): + if filename is None: + filename = self.lp_filename + if filename is not None: + with open(filename + ".lp", "w") as out: + complete_line = "" + for line in str(self.model.solver).splitlines(): + if ":" in line: + if complete_line != "": + out.write(complete_line) + complete_line = "" + else: + complete_line += line def all_variables(self): return self.pkgmgr.all_variables() @@ -118,3 +165,6 @@ def add_constraint_type(self, name, type): self.constraints[name] = dict() if name not in self.constraint_types: self.constraint_types[name] = type + + def current_media(self): + return self.pkgmgr.getpkg("KBaseMediaPkg").current_media diff --git a/modelseedpy/fbapkg/bilevelpkg.py b/modelseedpy/fbapkg/bilevelpkg.py index dc2960bc..cb1cf331 100644 --- a/modelseedpy/fbapkg/bilevelpkg.py +++ b/modelseedpy/fbapkg/bilevelpkg.py @@ -3,8 +3,13 @@ from __future__ import absolute_import import re -from optlang.symbolics import Zero, add -from cobra.core import Gene, Metabolite, Model, Reaction +from optlang.symbolics import Zero, add # !!! Neither import is used +from cobra.core import ( + Gene, + Metabolite, + Model, + Reaction, +) # !!! 
None of these imports are used from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg # Base class for FBA packages @@ -23,51 +28,52 @@ def build_package(self, filter=None, binary_variable_count=0): {}, [], {"binary_variable_count": binary_variable_count} ) print("binary_variable_count:", binary_variable_count) - coefficients = {} - obj_coef = {} - obj = self.model.solver.objective + varhash, coefficients, obj_coef = {}, {}, {} + objective = self.model.solver.objective + # Creating new objective coefficient and bound variables - bound_variables = {} - reactions = self.model.reactions if self.parameters["binary_variable_count"] > 0: for reaction in self.model.reactions: var = self.build_variable("flxcmp", reaction, None) - # Retreiving model data with componenent flux variables - # Using the JSON calls because get_linear_coefficients is REALLY slow + # Retrieving model data with component flux variables + # Using the JSON calls because get_linear_coefficients is REALLY slow # !!! get_linear_coefficients is still used? mdldata = self.model.solver.to_json() consthash = {} for const in mdldata["constraints"]: consthash[const["name"]] = const - constraints = list(self.model.solver.constraints) variables = list(self.model.solver.variables) - objterms = obj.get_linear_coefficients(variables) + objterms = objective.get_linear_coefficients(variables) + # Adding binary variables and constraints which should not be included in dual formulation if self.parameters["binary_variable_count"] > 0: for reaction in self.model.reactions: - var = self.build_variable("bflxcmp", reaction, None) + self.build_variable("bflxcmp", reaction, None) + # Now implementing dual variables and constraints - varhash = {} for var in variables: varhash[var.name] = var - for const in constraints: + for const in list(self.model.solver.constraints): var = self.build_variable("dualconst", const, obj_coef) - if ( - var != None - and const.name in consthash - and "expression" in consthash[const.name] - and "args" in consthash[const.name]["expression"] + if all( + [ + var, + const.name in consthash, + "expression" in consthash[const.name], + "args" in consthash[const.name]["expression"], + ] ): for item in consthash[const.name]["expression"]["args"]: - if ( - "args" in item - and len(item["args"]) >= 2 - and item["args"][1]["name"] in varhash + if all( + [ + "args" in item, + len(item["args"]) >= 2, + item["args"][1]["name"] in varhash, + ] ): - if varhash[item["args"][1]["name"]] not in coefficients: - coefficients[varhash[item["args"][1]["name"]]] = {} - coefficients[varhash[item["args"][1]["name"]]][var] = item[ - "args" - ][0]["value"] + var_name = varhash[item["args"][1]["name"]] + if var_name not in coefficients: + coefficients[var_name] = {} + coefficients[var_name][var] = item["args"][0]["value"] for var in variables: if var.type == "continuous": dvar = self.build_variable("duallb", var, obj_coef) @@ -80,95 +86,95 @@ def build_package(self, filter=None, binary_variable_count=0): if var not in coefficients: coefficients[var] = {} coefficients[var][dvar] = 1 - self.build_constraint("dualvar", var, obj, objterms, coefficients) - self.build_constraint("objective", None, obj, objterms, obj_coef) + self.build_constraint("dualvar", var, objective, objterms, coefficients) + self.build_constraint("objective", None, objective, objterms, obj_coef) - def build_variable(self, type, object, obj_coef): - if type == "dualconst": + def build_variable(self, obj_type, cobra_obj, obj_coef): + if obj_type == "dualconst": lb = -1000000 - ub = 
1000000 + ub = -lb coef = 0 - if object.lb == None: + if cobra_obj.lb == None: lb = 0 - coef = object.ub - if object.ub == None: + coef = cobra_obj.ub + if cobra_obj.ub == None: ub = 0 - coef = object.lb + coef = cobra_obj.lb var = BaseFBAPkg.build_variable( - self, type, lb, ub, "continuous", object.name + self, obj_type, lb, ub, "continuous", cobra_obj.name ) obj_coef[var] = coef return var if ( - type == "dualub" - ): # Add a constraint that makes this variable zero when binary variable is zero + obj_type == "dualub" + ): # constrain this variable to zero when the binary variable is zero var = BaseFBAPkg.build_variable( - self, type, 0, 1000000, "continuous", object.name + self, obj_type, 0, 1000000, "continuous", cobra_obj.name ) - if re.search("(.+)_(fflxcmp\d+)$", object.name) is not None: - m = re.search("(.+)_(fflxcmp\d+)$", object.name) - bvar = self.variables[m[2]][m[1]] + if re.search("(.+)_(fflxcmp\d+)$", cobra_obj.name) is not None: + match = re.search("(.+)_(fflxcmp\d+)$", cobra_obj.name) + bvar = self.variables[match[2]][match[1]] BaseFBAPkg.build_constraint( - self, "dualbin", None, 0, {var: 1, bvar: -1000000}, object.name + self, "dualbin", None, 0, {var: 1, bvar: -1000000}, cobra_obj.name ) - obj_coef[var] = object.ub + obj_coef[var] = cobra_obj.ub return var - if type == "duallb": + if obj_type == "duallb": var = BaseFBAPkg.build_variable( - self, type, -1000000, 0, "continuous", object.name + self, obj_type, -1000000, 0, "continuous", cobra_obj.name ) - # if re.search('(.+)_(fflxcmp\d+)$', object.name) is not None: + # if re.search('(.+)_(fflxcmp\d+)$', cobra_obj.name) is not None: # m = re.search('(.+)_(fflxcmp\d+)$', metabolite.id) # bvar = self.variables[m[2]][m[1]] - # BaseFBAPkg.build_constraint(self,object.name+"_lbdualbin",None,0,{var:-1,bvar:-1000000},object) - obj_coef[var] = object.lb + # BaseFBAPkg.build_constraint(self,cobra_obj.name+"_lbdualbin",None,0,{var:-1,bvar:-1000000},cobra_obj) + obj_coef[var] = cobra_obj.lb return var - if type == "flxcmp" and self.parameters["binary_variable_count"] > 0: + if obj_type == "flxcmp" and self.parameters["binary_variable_count"] > 0: denominator = 2 ** self.parameters["binary_variable_count"] - 1 coefs = [{}, {}] for i in range(0, self.parameters["binary_variable_count"]): value = 2**i - if object.lower_bound < 0: + if cobra_obj.lower_bound < 0: self.add_variable_type("rflxcmp" + str(i), "reaction") var = BaseFBAPkg.build_variable( self, "rflxcmp" + str(i), 0, - -1 * value * object.lower_bound / denominator, + -1 * value * cobra_obj.lower_bound / denominator, "continuous", - object, + cobra_obj, ) coefs[0][var] = -1 - if object.upper_bound > 0: + if cobra_obj.upper_bound > 0: self.add_variable_type("fflxcmp" + str(i), "reaction") var = BaseFBAPkg.build_variable( self, "fflxcmp" + str(i), 0, - value * object.upper_bound / denominator, + value * cobra_obj.upper_bound / denominator, "continuous", - object, + cobra_obj, ) coefs[1][var] = -1 - if object.lower_bound < 0: + if cobra_obj.lower_bound < 0: # flux - flux_comp_0 - flux_comp_n = 0 - restriction of reverse fluxes by component fluxes self.add_constraint_type("rflxcmpc", "reaction") - coefs[0][object.reverse_variable] = 1 - BaseFBAPkg.build_constraint(self, "rflxcmpc", 0, 0, coefs[0], object) - if object.upper_bound > 0: + coefs[0][cobra_obj.reverse_variable] = 1 + BaseFBAPkg.build_constraint(self, "rflxcmpc", 0, 0, coefs[0], cobra_obj) + if cobra_obj.upper_bound > 0: # flux - flux_comp_0 - flux_comp_n = 0 - restriction of forward fluxes by component fluxes 
self.add_constraint_type("fflxcmpc", "reaction") - coefs[1][object.forward_variable] = 1 - BaseFBAPkg.build_constraint(self, "fflxcmpc", 0, 0, coefs[1], object) + coefs[1][cobra_obj.forward_variable] = 1 + BaseFBAPkg.build_constraint(self, "fflxcmpc", 0, 0, coefs[1], cobra_obj) return None - if type == "bflxcmp" and self.parameters["binary_variable_count"] > 0: + if obj_type == "bflxcmp" and self.parameters["binary_variable_count"] > 0: for i in range(0, self.parameters["binary_variable_count"]): - if object.lower_bound < 0: + if cobra_obj.lower_bound < 0: self.add_variable_type("brflxcmp" + str(i), "reaction") var = BaseFBAPkg.build_variable( - self, "brflxcmp" + str(i), 0, 1, "binary", object + self, "brflxcmp" + str(i), 0, 1, "binary", cobra_obj ) - othervar = self.variables["rflxcmp" + str(i)][object.id] + othervar = self.variables["rflxcmp" + str(i)][cobra_obj.id] self.add_constraint_type("brflxcmpc" + str(i), "reaction") BaseFBAPkg.build_constraint( self, @@ -176,14 +182,14 @@ def build_variable(self, type, object, obj_coef): None, 0, {othervar: 1, var: -1000}, - object, + cobra_obj, ) - if object.upper_bound > 0: + if cobra_obj.upper_bound > 0: self.add_variable_type("bfflxcmp" + str(i), "reaction") var = BaseFBAPkg.build_variable( - self, "bfflxcmp" + str(i), 0, 1, "binary", object + self, "bfflxcmp" + str(i), 0, 1, "binary", cobra_obj ) - othervar = self.variables["fflxcmp" + str(i)][object.id] + othervar = self.variables["fflxcmp" + str(i)][cobra_obj.id] self.add_constraint_type("bfflxcmpc" + str(i), "reaction") BaseFBAPkg.build_constraint( self, @@ -191,30 +197,30 @@ def build_variable(self, type, object, obj_coef): None, 0, {othervar: 1, var: -1000}, - object, + cobra_obj, ) return None - def build_constraint(self, type, object, objective, objterms, coefficients): - if type == "dualvar": + def build_constraint(self, obj_type, cobra_obj, objective, objterms, coefficients): + if obj_type == "dualvar": coef = {} - lb = 0 - ub = 0 + lb = ub = 0 objsign = 1 if objective.direction == "min": objsign = -1 - if object in objterms: - lb = objterms[object] - ub = objterms[object] - if object in coefficients: - for var in coefficients[object]: - coef[var] = coefficients[object][var] - if object.lb == 0: + if cobra_obj in objterms: + lb = ub = objterms[cobra_obj] + if cobra_obj in coefficients: + for var in coefficients[cobra_obj]: + coef[var] = coefficients[cobra_obj][var] + if cobra_obj.lb == 0: ub = None - elif object.ub == 0: + elif cobra_obj.ub == 0: lb = None - return BaseFBAPkg.build_constraint(self, type, lb, ub, coef, object.name) - elif type == "objective": + return BaseFBAPkg.build_constraint( + self, obj_type, lb, ub, coef, cobra_obj.name + ) + elif obj_type == "objective": coef = {} objsign = 1 if objective.direction == "min": @@ -223,4 +229,6 @@ def build_constraint(self, type, object, objective, objterms, coefficients): coef[var] = objsign * objterms[var] for dvar in coefficients: coef[dvar] = -1 * coefficients[dvar] - return BaseFBAPkg.build_constraint(self, type, 0, 0, coef, "dualobjconst") + return BaseFBAPkg.build_constraint( + self, obj_type, 0, 0, coef, "dualobjconst" + ) diff --git a/modelseedpy/fbapkg/elementuptakepkg.py b/modelseedpy/fbapkg/elementuptakepkg.py index 66e01035..1f61f7a8 100644 --- a/modelseedpy/fbapkg/elementuptakepkg.py +++ b/modelseedpy/fbapkg/elementuptakepkg.py @@ -16,21 +16,37 @@ def __init__(self, model): {"elements": "string"}, ) - def build_package(self, element_limits): + def build_package( + self, element_limits, exception_compounds=[], 
exception_reactions=[] + ): + # Converting exception compounds list into exception reaction list + self.parameters = { + "element_limits": element_limits, + "exception_compounds": exception_compounds, + "exception_reactions": exception_reactions, + } + exchange_hash = self.modelutl.exchange_hash() + for met in exception_compounds: + if met in exchange_hash: + exception_reactions.append(exchange_hash[met]) + # Now building or rebuilding constraints for element in element_limits: if element not in self.variables["elements"]: self.build_variable(element, element_limits[element]) - self.build_constraint(element) + for element in element_limits: + # This call will first remove existing constraints then build the new constraint + self.build_constraint(element, exception_reactions) def build_variable(self, element, limit): return BaseFBAPkg.build_variable( self, "elements", 0, limit, "continuous", element ) - def build_constraint(self, element): + def build_constraint(self, element, exception_reactions): coef = {self.variables["elements"][element]: -1} - for reaction in self.model.reactions: - if reaction.id[0:3] == "EX_": + rxnlist = self.modelutl.exchange_list() + for reaction in rxnlist: + if reaction not in exception_reactions: total = 0 for metabolite in reaction.metabolites: elements = metabolite.elements diff --git a/modelseedpy/fbapkg/expressionactivationpkg.py b/modelseedpy/fbapkg/expressionactivationpkg.py new file mode 100644 index 00000000..04dda4f8 --- /dev/null +++ b/modelseedpy/fbapkg/expressionactivationpkg.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +import logging + +logger = logging.getLogger(__name__) +from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg +from modelseedpy.core.fbahelper import FBAHelper + +# Base class for FBA packages +class ExpressionActivationPkg(BaseFBAPkg): + def __init__(self, model): + BaseFBAPkg.__init__( + self, + model, + "ExpressionActivation", + {}, + {} + ) + self.pkgmgr.addpkgs(["ReactionActivationPkg"]) + + def build_package(self,on_hash,off_hash,on_coeff=None,off_coeff=None,other_coef=0.1,max_value=0.001): + activation_filter = {} + for rxn in on_hash: + activation_filter[rxn] = 1 + self.pkgmgr.getpkg("ReactionActivationPkg").build_package(rxn_filter=activation_filter,max_value=max_value) + expression_objective = self.model.problem.Objective(0, direction="min") + obj_coef = dict() + for rxn in self.model.reactions: + if rxn.id in on_hash: + coef = on_coeff + if coef == None: + coef = on_hash[rxn.id] + obj_coef[self.pkgmgr.getpkg("ReactionActivationPkg").variables["fra"][rxn.id]] = -1*coef + obj_coef[self.pkgmgr.getpkg("ReactionActivationPkg").variables["rra"][rxn.id]] = -1*coef + elif rxn.id in off_hash: + coef = off_coeff + if coef == None: + coef = off_hash[rxn.id] + obj_coef[rxn.forward_variable] = coef + obj_coef[rxn.reverse_variable] = coef + elif rxn.id[0:3] == "bio" or rxn.id[0:3] == "EX_" or rxn.id[0:3] == "SK_" or rxn.id[0:3] == "DM_": + pass + else: + obj_coef[rxn.forward_variable] = other_coef + obj_coef[rxn.reverse_variable] = other_coef + self.model.objective = expression_objective + expression_objective.set_linear_coefficients(obj_coef) + self.parameters["gfobj"] = self.model.objective \ No newline at end of file diff --git a/modelseedpy/fbapkg/flexiblebiomasspkg.py b/modelseedpy/fbapkg/flexiblebiomasspkg.py index b3185a4d..223f778d 100644 --- a/modelseedpy/fbapkg/flexiblebiomasspkg.py +++ b/modelseedpy/fbapkg/flexiblebiomasspkg.py @@ -3,9 +3,12 @@ from __future__ import absolute_import 
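For orientation, the ElementUptakePkg hunk above couples a bounded per-element variable to total uptake across all exchange reactions, skipping the exceptions. A minimal standalone sketch of that coupling (not part of the patch; `model`, `limit`, and `exception_reactions` are assumed names):

```python
# Hedged sketch of the ElementUptakePkg coupling: cap total carbon uptake.
# `model` is a cobra.Model; `limit` and `exception_reactions` are assumptions.
from optlang.symbolics import Zero

c_var = model.problem.Variable("elem_C", lb=0, ub=limit)  # element budget
cons = model.problem.Constraint(Zero, lb=0, ub=0, name="elem_C_total")
model.add_cons_vars([c_var, cons])
model.solver.update()  # register with the solver before setting coefficients
coef = {c_var: -1}
for rxn in model.reactions:
    if rxn.id.startswith("EX_") and rxn not in exception_reactions:
        met = next(iter(rxn.metabolites))    # exchanges carry one metabolite
        atoms = met.elements.get("C", 0)     # atoms of the limited element
        if atoms > 0:
            coef[rxn.reverse_variable] = atoms  # reverse flux = uptake
cons.set_linear_coefficients(coef)  # sum(atoms * uptake) - c_var = 0
```

Because `c_var` is bounded at `limit`, the equality caps total elemental uptake without constraining any single exchange reaction.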
import logging -from optlang.symbolics import Zero, add -from cobra import Model, Reaction, Metabolite + +logger = logging.getLogger(__name__) +from optlang.symbolics import Zero, add # !!! Neither import is ever used +from cobra import Model, Reaction, Metabolite # !!! Model and Metabolite are never used from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg +from modelseedpy.core.msmodelutl import MSModelUtil from modelseedpy.core.fbahelper import FBAHelper classes = { @@ -58,11 +61,12 @@ def build_package(self, parameters): parameters, ["bio_rxn_id"], { - "flex_coefficient": 0.75, + "flex_coefficient": [-0.75, 0.75], "use_rna_class": [-0.75, 0.75], "use_dna_class": [-0.75, 0.75], "use_protein_class": [-0.75, 0.75], "use_energy_class": [-0.1, 0.1], + "add_total_biomass_constraint": True, }, ) if self.parameters["bio_rxn_id"] not in self.model.reactions: @@ -79,305 +83,362 @@ def build_package(self, parameters): "cpd00067": None, "cpd00002": None, } - for metabolite in self.model.metabolites: - for msid in refcpd: - if FBAHelper.modelseed_id_from_cobra_metabolite(metabolite) == msid: - refcpd[msid] = metabolite + # Finding all reference compounds in the model + msid_hash = self.modelutl.msid_hash() + for msid in refcpd: + if msid in msid_hash: + refcpd[msid] = msid_hash[msid][0] + met_class = {} + # Determining class for each metabolite in biomass reaction for metabolite in self.parameters["bio_rxn"].metabolites: - msid = FBAHelper.modelseed_id_from_cobra_metabolite(metabolite) - if msid != "cpd11416": - met_class = "none" - if msid != None: + met_class[metabolite] = None + msid = MSModelUtil.metabolite_msid(metabolite) + if ( + msid != "cpd11416" + and msid != "cpd11463" + and msid != "cpd11462" + and msid != "cpd11461" + and msid != None + ): + if msid in refcpd: + met_class[metabolite] = "refcpd" + else: for curr_class in classes: - if msid in classes[curr_class]: - met_class = curr_class + if ( + self.parameters["use_" + curr_class + "_class"] + and msid in classes[curr_class] + ): + met_class[metabolite] = curr_class class_coef[curr_class][msid] = metabolite + # Eliminating any classes that are incomplete + for curr_class in classes: + for msid in classes[curr_class]: + if msid not in class_coef[curr_class]: + self.parameters["use_" + curr_class + "_class"] = None + break + # Creating FLEX reactions and constraints for unclassified compounds + flexcpds = {} + for metabolite in self.parameters["bio_rxn"].metabolites: + if not met_class[metabolite]: + flexcpds[metabolite] = self.parameters["bio_rxn"].metabolites[ + metabolite + ] + elif ( + met_class[metabolite] != "refcpd" + and not self.parameters["use_" + met_class[metabolite] + "_class"] + ): + flexcpds[metabolite] = self.parameters["bio_rxn"].metabolites[ + metabolite + ] + self.modelutl.add_exchanges_for_metabolites( + flexcpds, + uptake=1000, + excretion=1000, + prefix="FLEX_" + self.parameters["bio_rxn"].id + "_", + prefix_name="Biomass flex for ", + ) + for metabolite in flexcpds: + self.build_constraint(metabolite, "flxcpd") + # Creating metabolite class constraints + for met_class in classes: + if self.parameters["use_" + met_class + "_class"]: + add = 0 + total_coef = 0 + object_stoichiometry = {} + for msid in class_coef[met_class]: + if ( + met_class == "rna" + and msid == "cpd00002" + and "cpd00008" in class_coef["energy"] + ): + object_stoichiometry[class_coef[met_class][msid]] = ( + self.parameters["bio_rxn"].metabolites[ + class_coef[met_class][msid] + ] + + self.parameters["bio_rxn"].metabolites[ + 
class_coef["energy"]["cpd00008"] + ] + ) + else: + object_stoichiometry[ + class_coef[met_class][msid] + ] = self.parameters["bio_rxn"].metabolites[ + class_coef[met_class][msid] + ] + total_coef += abs(object_stoichiometry[class_coef[met_class][msid]]) if ( - met_class == "none" - or self.class_complete(class_coef, met_class) == 0 - or self.parameters["use_" + met_class + "_class"] == None - ) and msid not in refcpd: - drain_reaction = FBAHelper.add_drain_from_metabolite_id( - self.model, metabolite.id, 1000, 1000, "FLEX_" - ) - if drain_reaction.id not in self.new_reactions: - self.new_reactions[drain_reaction.id] = drain_reaction - self.model.add_reactions([drain_reaction]) - self.build_constraint(metabolite, "flxcpd") - for met_class in class_coef: - add = 0 - total_coef = 0 - object_stoichiometry = {} - for msid in class_coef[met_class]: + (met_class == "rna" or met_class == "dna") + and refcpd["cpd00012"] != None + and refcpd["cpd00001"] != None + ): + add = 1 + object_stoichiometry[refcpd["cpd00012"]] = total_coef + object_stoichiometry[refcpd["cpd00001"]] = total_coef + if met_class == "protein" and refcpd["cpd00001"] != None: + add = 1 + object_stoichiometry[refcpd["cpd00001"]] = total_coef if ( - met_class == "rna" - and msid == "cpd00002" - and "cpd00008" in class_coef["energy"] + met_class == "energy" + and refcpd["cpd00001"] != None + and refcpd["cpd00002"] != None + and refcpd["cpd00067"] != None + and refcpd["cpd00009"] != None ): - object_stoichiometry[class_coef[met_class][msid]] = ( - self.parameters["bio_rxn"].metabolites[ - class_coef[met_class][msid] - ] - + self.parameters["bio_rxn"].metabolites[ - class_coef["energy"]["cpd00008"] - ] - ) - else: - object_stoichiometry[class_coef[met_class][msid]] = self.parameters[ - "bio_rxn" - ].metabolites[class_coef[met_class][msid]] - total_coef += abs(object_stoichiometry[class_coef[met_class][msid]]) - if ( - (met_class == "rna" or met_class == "dna") - and refcpd["cpd00012"] != None - and refcpd["cpd00001"] != None - ): - add = 1 - object_stoichiometry[refcpd["cpd00012"]] = total_coef - object_stoichiometry[refcpd["cpd00001"]] = total_coef - if met_class == "protein" and refcpd["cpd00001"] != None: - add = 1 - object_stoichiometry[refcpd["cpd00001"]] = total_coef - if ( - met_class == "energy" - and refcpd["cpd00001"] != None - and refcpd["cpd00002"] != None - and refcpd["cpd00067"] != None - and refcpd["cpd00009"] != None - ): - add = 1 - object_stoichiometry[refcpd["cpd00001"]] = -1 * total_coef - object_stoichiometry[refcpd["cpd00002"]] = -1 * total_coef - object_stoichiometry[refcpd["cpd00009"]] = total_coef - object_stoichiometry[refcpd["cpd00067"]] = total_coef - if add == 1: - if met_class + "_flex" not in self.new_reactions: - self.new_reactions[met_class + "_flex"] = Reaction( - id=met_class + "_flex", - name=met_class + "_flex", - lower_bound=-1000, - upper_bound=1000, - ) - self.new_reactions[met_class + "_flex"].add_metabolites( - object_stoichiometry + add = 1 + object_stoichiometry[refcpd["cpd00001"]] = -1 * total_coef + object_stoichiometry[refcpd["cpd00002"]] = -1 * total_coef + object_stoichiometry[refcpd["cpd00009"]] = total_coef + object_stoichiometry[refcpd["cpd00067"]] = total_coef + if add == 1: + if met_class + "_flex" not in self.new_reactions: + self.new_reactions[met_class + "_flex"] = Reaction( + id=met_class + "_flex", + name=met_class + "_flex", + lower_bound=-1000, + upper_bound=1000, + ) + self.new_reactions[met_class + "_flex"].add_metabolites( + object_stoichiometry + ) + 
self.new_reactions[met_class + "_flex"].annotation[ + "sbo" + ] = "SBO:0000627" + self.model.add_reactions( + [self.new_reactions[met_class + "_flex"]] + ) + self.build_constraint( + self.new_reactions[met_class + "_flex"], "flxcls" ) - self.new_reactions[met_class + "_flex"].annotation[ - "sbo" - ] = "SBO:0000627" - self.model.add_reactions([self.new_reactions[met_class + "_flex"]]) - self.build_constraint(self.new_reactions[met_class + "_flex"], "flxcls") - self.build_constraint(self.parameters["bio_rxn"], "flxbio") + if parameters["add_total_biomass_constraint"]: + self.build_constraint(self.parameters["bio_rxn"], "flxbio") - def build_variable(self, object, type): + def build_variable(self, object, type): # !!! can the function be removed? pass - def build_constraint(self, object, type): - element_mass = FBAHelper.elemental_mass() - if type == "flxbio": + def build_constraint(self, cobra_obj, obj_type): + if obj_type == "flxbio": # Sum(MW*(vdrn,for-vdrn,ref)) + Sum(massdiff*(vrxn,for-vrxn,ref)) = 0 coef = {} for metabolite in self.parameters["bio_rxn"].metabolites: - if "FLEX_" + metabolite.id in self.model.reactions: + if ( + "FLEX_" + self.parameters["bio_rxn"].id + "_" + metabolite.id + in self.model.reactions + ): mw = FBAHelper.metabolite_mw(metabolite) sign = -1 if self.parameters["bio_rxn"].metabolites[metabolite] > 0: sign = 1 coef[ self.model.reactions.get_by_id( - "FLEX_" + metabolite.id + "FLEX_" + + self.parameters["bio_rxn"].id + + "_" + + metabolite.id ).forward_variable ] = (sign * mw) coef[ self.model.reactions.get_by_id( - "FLEX_" + metabolite.id + "FLEX_" + + self.parameters["bio_rxn"].id + + "_" + + metabolite.id ).reverse_variable ] = (-1 * sign * mw) for met_class in classes: if met_class + "_flex" in self.model.reactions: massdiff = 0 rxn = self.model.reactions.get_by_id(met_class + "_flex") - for metabolite in rxn.metabolites: - mw = FBAHelper.metabolite_mw(metabolite) - massdiff += rxn.metabolites[metabolite] * mw + for met in rxn.metabolites: + mw = FBAHelper.metabolite_mw(met) + massdiff += rxn.metabolites[met] * mw if abs(massdiff) > 0.00001: coef[rxn.forward_variable] = massdiff - coef[rxn.reverse_variable] = -1 * massdiff - return BaseFBAPkg.build_constraint(self, type, 0, 0, coef, object) - elif type == "flxcpd": - # 0.75 * abs(bio_coef) * vbio - vdrn,for >= 0 - # 0.75 * abs(bio_coef) * vbio - vdrn,rev >= 0 - coef = self.parameters["flex_coefficient"] * abs( - self.parameters["bio_rxn"].metabolites[object] - ) - if coef > 0.75: - coef = 0.75 - BaseFBAPkg.build_constraint( - self, - "f" + type, - 0, - None, - { - self.parameters["bio_rxn"].forward_variable: coef, - self.model.reactions.get_by_id( - "FLEX_" + object.id - ).forward_variable: -1, - }, - object, - ) - return BaseFBAPkg.build_constraint( - self, - "r" + type, - 0, - None, - { - self.parameters["bio_rxn"].forward_variable: coef, - self.model.reactions.get_by_id( - "FLEX_" + object.id - ).reverse_variable: -1, - }, - object, - ) - elif type == "flxcls" and object.id[0:-5] != None: - # 0.75 * vbio - vrxn,for >= 0 - # 0.75 * vbio - vrxn,rev >= 0 - # First deal with the situation where the flux is locked into a particular value relative to biomass + coef[rxn.reverse_variable] = -massdiff + return BaseFBAPkg.build_constraint(self, obj_type, 0, 0, coef, cobra_obj) + elif obj_type == "flxcpd" or obj_type == "flxcls": + first_entry = None + second_entry = None + product = False + biovar = self.parameters["bio_rxn"].forward_variable + object = None const = None - if ( - self.parameters["use_" + 
object.id[0:-5] + "_class"][0] - == self.parameters["use_" + object.id[0:-5] + "_class"][1] + if obj_type == "flxcpd": + # 0.75 * abs(bio_coef) * vbio - vdrn,for >= 0 + # 0.75 * abs(bio_coef) * vbio - vdrn,rev >= 0 + first_entry = self.parameters["flex_coefficient"][0] * abs( + self.parameters["bio_rxn"].metabolites[cobra_obj] + ) + second_entry = self.parameters["flex_coefficient"][1] * abs( + self.parameters["bio_rxn"].metabolites[cobra_obj] + ) + if self.parameters["bio_rxn"].metabolites[cobra_obj] > 0: + product = True + object = self.model.reactions.get_by_id( + "FLEX_" + self.parameters["bio_rxn"].id + "_" + cobra_obj.id + ) + elif ( + cobra_obj.id[0:-5] == None + or not self.parameters["use_" + cobra_obj.id[0:-5] + "_class"] ): + return None + else: + # 0.75 * vbio - vrxn,for >= 0 + # 0.75 * vbio - vrxn,rev >= 0 + first_entry = self.parameters["use_" + cobra_obj.id[0:-5] + "_class"][0] + second_entry = self.parameters["use_" + cobra_obj.id[0:-5] + "_class"][ + 1 + ] + object = cobra_obj + if first_entry == second_entry: # If the value is positive, lock in the forward variable and set the reverse to zero - if self.parameters["use_" + object.id[0:-5] + "_class"][0] > 0: - const = BaseFBAPkg.build_constraint( - self, - "f" + type, - 0, - 0, - { - self.parameters[ - "bio_rxn" - ].forward_variable: self.parameters[ - "use_" + object.id[0:-5] + "_class" - ][ - 1 - ], - object.forward_variable: -1, - }, - object, - ) - object.lower_bound = 0 + if first_entry > 0: + if product: + const = self.build_constraint( + "f" + obj_type, + 0, + 0, + {biovar: second_entry, object.forward_variable: -1}, + cobra_obj, + ) + object.lower_bound = 0 + else: + const = self.build_constraint( + "f" + obj_type, + 0, + 0, + {biovar: second_entry, object.reverse_variable: -1}, + cobra_obj, + ) + object.upper_bound = 0 # If the value is negative, lock in the reverse variable and set the forward to zero - elif self.parameters["use_" + object.id[0:-5] + "_class"][0] < 0: - const = BaseFBAPkg.build_constraint( - self, - "r" + type, - 0, - 0, - { - self.parameters["bio_rxn"].forward_variable: -1 - * self.parameters["use_" + object.id[0:-5] + "_class"][0], - object.reverse_variable: -1, - }, - object, - ) - object.upper_bound = 0 + elif first_entry < 0: + if product: + const = self.build_constraint( + "r" + obj_type, + 0, + 0, + {biovar: -first_entry, object.reverse_variable: -1}, + cobra_obj, + ) + object.upper_bound = 0 + else: + const = self.build_constraint( + "r" + obj_type, + 0, + 0, + {biovar: -first_entry, object.forward_variable: -1}, + cobra_obj, + ) + object.lower_bound = 0 # If the value is zero, lock both variables to zero - if self.parameters["use_" + object.id[0:-5] + "_class"][0] == 0: + if first_entry == 0: object.lower_bound = 0 object.upper_bound = 0 - elif self.parameters["use_" + object.id[0:-5] + "_class"][1] >= 0: - if self.parameters["use_" + object.id[0:-5] + "_class"][0] >= 0: - const = BaseFBAPkg.build_constraint( - self, - "f" + type, - 0, - None, - { - self.parameters[ - "bio_rxn" - ].forward_variable: self.parameters[ - "use_" + object.id[0:-5] + "_class" - ][ - 1 - ], - object.forward_variable: -1, - }, - object, - ) - BaseFBAPkg.build_constraint( - self, - "r" + type, + elif second_entry >= 0: + if first_entry >= 0: + if product: + const = BaseFBAPkg.build_constraint( + self, + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.forward_variable: -1}, + cobra_obj, + ) + object.lower_bound = 0 + if first_entry > 0: + BaseFBAPkg.build_constraint( + self, + "r" + obj_type, + 0, + 
None, + {biovar: -first_entry, object.forward_variable: 1}, + cobra_obj, + ) + else: + const = BaseFBAPkg.build_constraint( + self, + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.reverse_variable: -1}, + cobra_obj, + ) + object.upper_bound = 0 + if first_entry > 0: + BaseFBAPkg.build_constraint( + self, + "r" + obj_type, + 0, + None, + {biovar: -first_entry, object.reverse_variable: 1}, + cobra_obj, + ) + else: + if product: + const = self.build_constraint( + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.forward_variable: -1}, + cobra_obj, + ) + self.build_constraint( + "r" + obj_type, + 0, + None, + {biovar: -first_entry, object.reverse_variable: -1}, + cobra_obj, + ) + else: + const = self.build_constraint( + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.reverse_variable: -1}, + cobra_obj, + ) + self.build_constraint( + "r" + obj_type, + 0, + None, + {biovar: -first_entry, object.forward_variable: -1}, + cobra_obj, + ) + else: + if second_entry < 0: + if product: + const = self.build_constraint( + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.reverse_variable: 1}, + cobra_obj, + ) + else: + const = self.build_constraint( + "f" + obj_type, + 0, + None, + {biovar: second_entry, object.forward_variable: 1}, + cobra_obj, + ) + if product: + self.build_constraint( + "r" + obj_type, 0, None, - { - self.parameters["bio_rxn"].forward_variable: -1 - * self.parameters["use_" + object.id[0:-5] + "_class"][0], - object.forward_variable: 1, - }, - object, + {biovar: -first_entry, object.reverse_variable: -1}, + cobra_obj, ) object.lower_bound = 0 else: - const = BaseFBAPkg.build_constraint( - self, - "f" + type, - 0, - None, - { - self.parameters[ - "bio_rxn" - ].forward_variable: self.parameters[ - "use_" + object.id[0:-5] + "_class" - ][ - 1 - ], - object.forward_variable: -1, - }, - object, - ) - BaseFBAPkg.build_constraint( - self, - "r" + type, + self.build_constraint( + "r" + obj_type, 0, None, - { - self.parameters["bio_rxn"].forward_variable: -1 - * self.parameters["use_" + object.id[0:-5] + "_class"][0], - object.reverse_variable: -1, - }, - object, + {biovar: -first_entry, object.forward_variable: -1}, + cobra_obj, ) - else: - const = BaseFBAPkg.build_constraint( - self, - "f" + type, - 0, - None, - { - self.parameters["bio_rxn"].forward_variable: self.parameters[ - "use_" + object.id[0:-5] + "_class" - ][1], - object.reverse_variable: 1, - }, - object, - ) - BaseFBAPkg.build_constraint( - self, - "r" + type, - 0, - None, - { - self.parameters["bio_rxn"].forward_variable: -1 - * self.parameters["use_" + object.id[0:-5] + "_class"][0], - object.reverse_variable: -1, - }, - object, - ) - object.upper_bound = 0 + object.upper_bound = 0 return const - - def class_complete(self, class_coef, met_class): - for msid in classes[met_class]: - if msid not in class_coef[met_class]: - return 0 - return 1 diff --git a/modelseedpy/fbapkg/fluxfittingpkg.py b/modelseedpy/fbapkg/fluxfittingpkg.py old mode 100644 new mode 100755 index f4f8f05d..810f2567 --- a/modelseedpy/fbapkg/fluxfittingpkg.py +++ b/modelseedpy/fbapkg/fluxfittingpkg.py @@ -3,7 +3,9 @@ from __future__ import absolute_import import logging -from optlang.symbolics import Zero, add + +logger = logging.getLogger(__name__) +from optlang.symbolics import Zero, add # !!! 
Zero is never used from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg # Base class for FBA packages @@ -25,11 +27,10 @@ def build_package(self, parameters): "rescale_vfit_by_flux": True, }, ) - if self.parameters["totalflux"] == 0: + if self.parameters["totalflux"] == 1: self.pkgmgr.getpkg("RevBinPkg", 1).build_package( self.parameters["target_flux"] ) - else: self.pkgmgr.getpkg("TotalFluxPkg", 1).build_package( self.parameters["target_flux"] ) @@ -39,7 +40,7 @@ def build_package(self, parameters): rxnobj = self.model.reactions.get_by_id(rxnid) var = self.build_variable(rxnobj) objvars.append(var**2) - const = self.build_constraint(rxnobj) + self.build_constraint(rxnobj) if self.parameters["set_objective"] == 1: self.model.objective = self.model.problem.Objective( add(objvars), direction="min", sloppy=True @@ -47,24 +48,28 @@ def build_package(self, parameters): def build_variable(self, object): return BaseFBAPkg.build_variable( - self, "vfit", -1000, 1000, "continuous", object + self, "vfit", -100000, 100000, "continuous", object ) - def build_constraint(self, object): + def build_constraint(self, cobra_obj): # vfit(i) = flux(i) - v(i) - if object.id in self.parameters["target_flux"]: - flux = self.parameters["target_flux"][object.id] + if cobra_obj.id in self.parameters["target_flux"]: + flux = self.parameters["target_flux"][cobra_obj.id] vfitcoef = 1 # if self.parameters["rescale_vfit_by_flux"] == True: # if flux != None and abs(flux) > 0: # vfitcoef = vfitcoef*flux#Multiply coef by fit flux which rescales by flux # else: # vfitcoef = vfitcoef*self.parameters["default_rescaling"]#Multiply coef by fit flux which rescales by flux - coef = {self.variables["vfit"][object.id]: vfitcoef} + coef = {self.variables["vfit"][cobra_obj.id]: vfitcoef} if self.parameters["totalflux"] == 0: - coef[object.forward_variable] = 1 - coef[object.reverse_variable] = -1 + coef[cobra_obj.forward_variable] = 1 + coef[cobra_obj.reverse_variable] = -1 else: - coef[self.pkgmgr.getpkg("TotalFluxPkg").variables["tf"][object.id]] = 1 + coef[ + self.pkgmgr.getpkg("TotalFluxPkg").variables["tf"][cobra_obj.id] + ] = 1 # !!! the total flux package does not return anything flux = abs(flux) - return BaseFBAPkg.build_constraint(self, "vfitc", flux, flux, coef, object) + return BaseFBAPkg.build_constraint( + self, "vfitc", flux, flux, coef, cobra_obj + ) diff --git a/modelseedpy/fbapkg/foldchangefittingpkg.py b/modelseedpy/fbapkg/foldchangefittingpkg.py new file mode 100755 index 00000000..0859520f --- /dev/null +++ b/modelseedpy/fbapkg/foldchangefittingpkg.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +import logging + +logger = logging.getLogger(__name__) +from optlang.symbolics import Zero, add # !!! 
Zero is never used +from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg + +# Base class for FBA packages +class FoldChangeFittingPkg(BaseFBAPkg): + def __init__(self, model): + BaseFBAPkg.__init__( + self, model, "fold change fitting", {}, {} + ) + + def build_package(self, parameters): + self.validate_parameters( + parameters, + ["msexp"], + { + "quadratic": True, + "condition_models": {} + }, + ) + ref_flux_pkg_args = { + "coef": {}, + "fixed": True, + "set_objective": False, + "quadratic": self.parameters["quadratic"], + "objective_coef":{}, + "default_objective_coef": 1.0 + } + rxn_expression = self # !!! self.object, MSGenome, and build_reaction_expression are undefined here + if isinstance(self.object, MSGenome): + rxn_expression = self.build_reaction_expression(self.model.model) + + for feature in self.parameters["msexp"].features: + if feature.id in self.model.reactions: + ref_flux_pkg_args["objective_coef"][feature.id] = 1 + ref_flux_pkg_args["coef"][feature.id] = 1 + #Adding the reference flux variables to the main model + self.pkgmgr.getpkg("ReferenceFluxPkg").build_package(ref_flux_pkg_args) + ref_flux_pkg_args["fixed"] = False + ref_flux_pkg_args["set_objective"] = True + other_models = [] + for condition in self.parameters["condition_models"]: + if condition not in self.parameters["msexp"].conditions: + raise ValueError("Condition "+condition+" not found in expression data") + mdl = self.parameters["condition_models"][condition] + #Adding the reference flux variables to each model + mdl.pkgmgr.getpkg("ReferenceFluxPkg").build_package(ref_flux_pkg_args) + other_models.append(mdl) + shared_var_pkgs = {"ReferenceFluxPkg": ["refv"]} + #Replicating the other models into the main model + self.pkgmgr.getpkg("ProblemReplicationPkg").build_package( + { + "models": other_models, + "shared_variable_packages": shared_var_pkgs + } + ) + + def build_variables(self, object,coef=None): + BaseFBAPkg.build_variable( + self, "refv", object.lower_bound, object.upper_bound, "continuous", object + ) + lower_bound = -1000 + if not self.parameters["quadratic"]: + BaseFBAPkg.build_variable( + self, "nrefer", 0, 1000, "continuous", object + ) + lower_bound = 0 + BaseFBAPkg.build_variable( + self, "preferr", lower_bound, 1000, "continuous", object + ) + + def build_constraints(self, object): + # Variable: preferr(i) - nrefer(i) = refv(i) - forward_flux(i) + reverse_flux(i) + # Fixed: refv(i) = forward_flux(i) - reverse_flux(i) + coef = { + self.variables["refv"][object.id]: 1, + object.forward_variable: -1, + object.reverse_variable: 1 + } + if not self.parameters["fixed"]: + coef[self.variables["preferr"][object.id]] = -1 + if not self.parameters["quadratic"]: + coef[self.variables["nrefer"][object.id]] = 1 + return BaseFBAPkg.build_constraint( + self, "refvc", 0, 0, coef, object + ) \ No newline at end of file diff --git a/modelseedpy/fbapkg/gapfillingpkg.py b/modelseedpy/fbapkg/gapfillingpkg.py index 1ccc98f3..d52d440e 100644 --- a/modelseedpy/fbapkg/gapfillingpkg.py +++ b/modelseedpy/fbapkg/gapfillingpkg.py @@ -3,364 +3,45 @@ from __future__ import absolute_import import logging +import sys import re import json from optlang.symbolics import Zero, add from cobra import Model, Reaction, Metabolite +from cobra.flux_analysis import pfba +from cobra.io import ( + load_json_model, + save_json_model, + load_matlab_model, + save_matlab_model, + read_sbml_model, + write_sbml_model, +) from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg from modelseedpy.core.fbahelper import FBAHelper logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set 
this to INFO then change needed messages below from DEBUG to INFO -default_blacklist = [ - "rxn12985", - "rxn00238", - "rxn07058", - "rxn05305", - "rxn00154", - "rxn09037", - "rxn10643", - "rxn11317", - "rxn05254", - "rxn05257", - "rxn05258", - "rxn05259", - "rxn05264", - "rxn05268", - "rxn05269", - "rxn05270", - "rxn05271", - "rxn05272", - "rxn05273", - "rxn05274", - "rxn05275", - "rxn05276", - "rxn05277", - "rxn05278", - "rxn05279", - "rxn05280", - "rxn05281", - "rxn05282", - "rxn05283", - "rxn05284", - "rxn05285", - "rxn05286", - "rxn05963", - "rxn05964", - "rxn05971", - "rxn05989", - "rxn05990", - "rxn06041", - "rxn06042", - "rxn06043", - "rxn06044", - "rxn06045", - "rxn06046", - "rxn06079", - "rxn06080", - "rxn06081", - "rxn06086", - "rxn06087", - "rxn06088", - "rxn06089", - "rxn06090", - "rxn06091", - "rxn06092", - "rxn06138", - "rxn06139", - "rxn06140", - "rxn06141", - "rxn06145", - "rxn06217", - "rxn06218", - "rxn06219", - "rxn06220", - "rxn06221", - "rxn06222", - "rxn06223", - "rxn06235", - "rxn06362", - "rxn06368", - "rxn06378", - "rxn06474", - "rxn06475", - "rxn06502", - "rxn06562", - "rxn06569", - "rxn06604", - "rxn06702", - "rxn06706", - "rxn06715", - "rxn06803", - "rxn06811", - "rxn06812", - "rxn06850", - "rxn06901", - "rxn06971", - "rxn06999", - "rxn07123", - "rxn07172", - "rxn07254", - "rxn07255", - "rxn07269", - "rxn07451", - "rxn09037", - "rxn10018", - "rxn10077", - "rxn10096", - "rxn10097", - "rxn10098", - "rxn10099", - "rxn10101", - "rxn10102", - "rxn10103", - "rxn10104", - "rxn10105", - "rxn10106", - "rxn10107", - "rxn10109", - "rxn10111", - "rxn10403", - "rxn10410", - "rxn10416", - "rxn11313", - "rxn11316", - "rxn11318", - "rxn11353", - "rxn05224", - "rxn05795", - "rxn05796", - "rxn05797", - "rxn05798", - "rxn05799", - "rxn05801", - "rxn05802", - "rxn05803", - "rxn05804", - "rxn05805", - "rxn05806", - "rxn05808", - "rxn05812", - "rxn05815", - "rxn05832", - "rxn05836", - "rxn05851", - "rxn05857", - "rxn05869", - "rxn05870", - "rxn05884", - "rxn05888", - "rxn05896", - "rxn05898", - "rxn05900", - "rxn05903", - "rxn05904", - "rxn05905", - "rxn05911", - "rxn05921", - "rxn05925", - "rxn05936", - "rxn05947", - "rxn05956", - "rxn05959", - "rxn05960", - "rxn05980", - "rxn05991", - "rxn05992", - "rxn05999", - "rxn06001", - "rxn06014", - "rxn06017", - "rxn06021", - "rxn06026", - "rxn06027", - "rxn06034", - "rxn06048", - "rxn06052", - "rxn06053", - "rxn06054", - "rxn06057", - "rxn06059", - "rxn06061", - "rxn06102", - "rxn06103", - "rxn06127", - "rxn06128", - "rxn06129", - "rxn06130", - "rxn06131", - "rxn06132", - "rxn06137", - "rxn06146", - "rxn06161", - "rxn06167", - "rxn06172", - "rxn06174", - "rxn06175", - "rxn06187", - "rxn06189", - "rxn06203", - "rxn06204", - "rxn06246", - "rxn06261", - "rxn06265", - "rxn06266", - "rxn06286", - "rxn06291", - "rxn06294", - "rxn06310", - "rxn06320", - "rxn06327", - "rxn06334", - "rxn06337", - "rxn06339", - "rxn06342", - "rxn06343", - "rxn06350", - "rxn06352", - "rxn06358", - "rxn06361", - "rxn06369", - "rxn06380", - "rxn06395", - "rxn06415", - "rxn06419", - "rxn06420", - "rxn06421", - "rxn06423", - "rxn06450", - "rxn06457", - "rxn06463", - "rxn06464", - "rxn06466", - "rxn06471", - "rxn06482", - "rxn06483", - "rxn06486", - "rxn06492", - "rxn06497", - "rxn06498", - "rxn06501", - "rxn06505", - "rxn06506", - "rxn06521", - "rxn06534", - "rxn06580", - "rxn06585", - "rxn06593", - "rxn06609", - "rxn06613", - "rxn06654", - "rxn06667", - "rxn06676", - "rxn06693", - "rxn06730", - "rxn06746", - "rxn06762", - "rxn06779", - "rxn06790", - "rxn06791", - 
"rxn06792", - "rxn06793", - "rxn06794", - "rxn06795", - "rxn06796", - "rxn06797", - "rxn06821", - "rxn06826", - "rxn06827", - "rxn06829", - "rxn06839", - "rxn06841", - "rxn06842", - "rxn06851", - "rxn06866", - "rxn06867", - "rxn06873", - "rxn06885", - "rxn06891", - "rxn06892", - "rxn06896", - "rxn06938", - "rxn06939", - "rxn06944", - "rxn06951", - "rxn06952", - "rxn06955", - "rxn06957", - "rxn06960", - "rxn06964", - "rxn06965", - "rxn07086", - "rxn07097", - "rxn07103", - "rxn07104", - "rxn07105", - "rxn07106", - "rxn07107", - "rxn07109", - "rxn07119", - "rxn07179", - "rxn07186", - "rxn07187", - "rxn07188", - "rxn07195", - "rxn07196", - "rxn07197", - "rxn07198", - "rxn07201", - "rxn07205", - "rxn07206", - "rxn07210", - "rxn07244", - "rxn07245", - "rxn07253", - "rxn07275", - "rxn07299", - "rxn07302", - "rxn07651", - "rxn07723", - "rxn07736", - "rxn07878", - "rxn11417", - "rxn11582", - "rxn11593", - "rxn11597", - "rxn11615", - "rxn11617", - "rxn11619", - "rxn11620", - "rxn11624", - "rxn11626", - "rxn11638", - "rxn11648", - "rxn11651", - "rxn11665", - "rxn11666", - "rxn11667", - "rxn11698", - "rxn11983", - "rxn11986", - "rxn11994", - "rxn12006", - "rxn12007", - "rxn12014", - "rxn12017", - "rxn12022", - "rxn12160", - "rxn12161", - "rxn01267", - "rxn05294", - "rxn04656", -] +base_blacklist = {"rxn04656":"=","rxn07589":"<","rxn07588":"<"} +zero_threshold = 1e-8 class GapfillingPkg(BaseFBAPkg): """ """ def __init__(self, model): - BaseFBAPkg.__init__(self, model, "gapfilling", {}, {}) + BaseFBAPkg.__init__( + self, + model, + "gapfilling", + {"rmaxf": "reaction", "fmaxf": "reaction"}, + {"rmaxfc": "reaction", "fmaxfc": "reaction"}, + ) self.gapfilling_penalties = None + self.maxflux_variables = {} def build(self, template, minimum_objective=0.01): parameters = { @@ -394,7 +75,7 @@ def build_package(self, parameters): parameters, [], { - "auto_sink": ["cpd02701", "cpd11416", "cpd15302"], + "auto_sink": ["cpd01042","cpd02701", "cpd11416", "cpd15302", "cpd03091"], "extend_with_template": 1, "model_penalty": 1, "default_gapfill_models": [], @@ -405,12 +86,17 @@ def build_package(self, parameters): "gapfill_all_indecies_with_default_templates": 1, "gapfill_all_indecies_with_default_models": 1, "default_excretion": 100, - "default_uptake": -100, + "default_uptake": 0, "minimum_obj": 0.01, - "set_objective": 1, - "blacklist": default_blacklist, - }, + "minimize_exchanges": False, + "blacklist": [], + "base_media": None, + "objective":self.model.objective, + "base_media_target_element": "C", + "default_exchange_penalty":0.1 + } ) + # Adding model reactions to original reaction list self.parameters["original_reactions"] = [] for rxn in self.model.reactions: @@ -422,17 +108,29 @@ def build_package(self, parameters): self.parameters["original_reactions"].append([rxn, "<"]) if rxn.upper_bound > 0: self.parameters["original_reactions"].append([rxn, ">"]) + # Adding constraint for target reaction - self.parameters["origobj"] = self.model.objective - self.pkgmgr.getpkg("ObjConstPkg").build_package( - self.parameters["minimum_obj"], None - ) - + self.set_base_objective(self.parameters["objective"],self.parameters["minimum_obj"]) + + #Extending model + self.extend_model_for_gapfilling() + + #Computing gapfilling penalties + self.compute_gapfilling_penalties() + + # Creating the gapfilling objective function and saving it under self.parameters["gfobj"] + self.build_gapfilling_objective_function() + + def extend_model_for_gapfilling(self): + """Extends the model for gapfilling + Parameters + ---------- + None + """ 
# Determine all indecies that should be gapfilled indexhash = self.get_model_index_hash() - # Iterating over all indecies with more than 10 intracellular compounds: - self.gapfilling_penalties = dict() + self.base_gapfilling_penalties = dict() for index in indexhash: if indexhash[index] > 10: if index == "none": @@ -440,12 +138,12 @@ def build_package(self, parameters): new_penalties = self.extend_model_with_template_for_gapfilling( template, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) for gfmdl in self.parameters["default_gapfill_models"]: new_penalties = self.extend_model_with_model_for_gapfilling( gfmdl, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) if index in self.parameters["gapfill_templates_by_index"]: for template in self.parameters["gapfill_templates_by_index"][ index @@ -453,67 +151,162 @@ def build_package(self, parameters): new_penalties = self.extend_model_with_template_for_gapfilling( template, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) if index in self.parameters["gapfill_models_by_index"]: for gfmdl in self.parameters["gapfill_models_by_index"]: new_penalties = self.extend_model_with_model_for_gapfilling( gfmdl, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) if self.parameters["gapfill_all_indecies_with_default_templates"]: for template in self.parameters["default_gapfill_templates"]: new_penalties = self.extend_model_with_template_for_gapfilling( template, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) if self.parameters["gapfill_all_indecies_with_default_models"]: for gfmdl in self.parameters["default_gapfill_models"]: new_penalties = self.extend_model_with_model_for_gapfilling( gfmdl, index ) - self.gapfilling_penalties.update(new_penalties) + self.base_gapfilling_penalties.update(new_penalties) + + def compute_gapfilling_penalties(self,exclusion_solution=None,reaction_scores=None): + """Builds gapfilling objective function for model + Parameters + ---------- + exclusion_solution : [string rxn_id,string direction] + Solution with reaction directions that should be removed from the gapfilling objective function + """ + self.gapfilling_penalties = self.base_gapfilling_penalties.copy() + #Removing exclusion solution reactions from penalties dictionary + if exclusion_solution: + for item in exclusion_solution: + if item[0] in self.gapfilling_penalties: + if item[1] == ">" and "forward" in self.gapfilling_penalties[item[0]]: + del self.gapfilling_penalties[item[0]]["forward"] + elif item[1] == "<" and "reverse" in self.gapfilling_penalties[item[0]]: + del self.gapfilling_penalties[item[0]]["reverse"] # Rescaling penalties by reaction scores and saving genes + if not reaction_scores: + reaction_scores = self.parameters["reaction_scores"] for reaction in self.gapfilling_penalties: rxnid = reaction.split("_")[0] - if rxnid in self.parameters["reaction_scores"]: - highest_score = 0 - for gene in self.parameters["reaction_scores"][rxnid]: - if highest_score < self.parameters["reaction_scores"][rxnid][gene]: - highest_score = self.parameters["reaction_scores"][rxnid][gene] - factor = 0.1 - if "reverse" in self.gapfilling_penalties[reaction]: - self.gapfilling_penalties[reaction]["reverse"] = ( - factor * 
self.gapfilling_penalties[reaction]["reverse"] + highest_score = 0 + if rxnid in reaction_scores: + for gene in reaction_scores[rxnid]: + score = None + if isinstance(reaction_scores[rxnid][gene], dict): + score = reaction_scores[rxnid][gene]["probability"] + else: + score = reaction_scores[rxnid][gene] + if highest_score < score: + highest_score = score + factor = 2-highest_score + if "reverse" in self.gapfilling_penalties[reaction]: + self.gapfilling_penalties[reaction]["reverse"] = ( + factor * self.gapfilling_penalties[reaction]["reverse"] + ) + if "forward" in self.gapfilling_penalties[reaction]: + self.gapfilling_penalties[reaction]["forward"] = ( + factor * self.gapfilling_penalties[reaction]["forward"] + ) + + def build_gapfilling_objective_function(self): + """Builds gapfilling objective function for model + """ + reaction_objective = self.model.problem.Objective(Zero, direction="min") + obj_coef = dict() + for reaction in self.model.reactions: + if reaction.id in self.gapfilling_penalties: + # Minimizing gapfilled reactions + if "reverse" in self.gapfilling_penalties[reaction.id]: + obj_coef[reaction.reverse_variable] = abs( + self.gapfilling_penalties[reaction.id]["reverse"] ) - if "forward" in self.gapfilling_penalties[reaction]: - self.gapfilling_penalties[reaction]["forward"] = ( - factor * self.gapfilling_penalties[reaction]["forward"] + if "forward" in self.gapfilling_penalties[reaction.id]: + obj_coef[reaction.forward_variable] = abs( + self.gapfilling_penalties[reaction.id]["forward"] + ) + else: + obj_coef[reaction.forward_variable] = 0 + obj_coef[reaction.reverse_variable] = 0 + self.model.objective = reaction_objective + reaction_objective.set_linear_coefficients(obj_coef) + self.parameters["gfobj"] = self.model.objective + + def create_max_flux_variables(self): + """Creates max flux variables needed for the global gapfilling formulation + Parameters + ---------- + """ + for reaction in self.model.reactions: + if reaction.id in self.gapfilling_penalties: + if "reverse" in self.gapfilling_penalties[reaction.id]: + if reaction.id not in self.maxflux_variables: + self.maxflux_variables[reaction.id] = {} + self.maxflux_variables[reaction.id][ + "reverse" + ] = self.build_variable( + "rmaxf", 0, 1000, "continuous", reaction + ) + self.build_constraint( + "rmaxfc", + 0, + None, + { + reaction.reverse_variable: -1, + self.maxflux_variables[reaction.id]["reverse"]: 1, + }, + reaction, + ) + if "forward" in self.gapfilling_penalties[reaction.id]: + if reaction.id not in self.maxflux_variables: + self.maxflux_variables[reaction.id] = {} + self.maxflux_variables[reaction.id][ + "forward" + ] = self.build_variable( + "fmaxf", 0, 1000, "continuous", reaction + ) + self.build_constraint( + "fmaxfc", + 0, + None, + { + reaction.forward_variable: -1, + self.maxflux_variables[reaction.id]["forward"]: 1, + }, + reaction, ) - self.model.solver.update() - if self.parameters["set_objective"] == 1: - reaction_objective = self.model.problem.Objective(Zero, direction="min") - obj_coef = dict() - for reaction in self.model.reactions: - if reaction.id in self.gapfilling_penalties: - # Minimizing gapfilled reactions - if "reverse" in self.gapfilling_penalties[reaction.id]: - obj_coef[reaction.reverse_variable] = abs( - self.gapfilling_penalties[reaction.id]["reverse"] - ) - # elif default_penalty != 0: - # obj_coef[reaction.reverse_variable] = 0 - if "forward" in self.gapfilling_penalties[reaction.id]: - obj_coef[reaction.forward_variable] = abs( - 
self.gapfilling_penalties[reaction.id]["forward"] - ) - # elif default_penalty != 0: - # obj_coef[reaction.forward_variable] = 0 - else: - obj_coef[reaction.forward_variable] = 0 - obj_coef[reaction.reverse_variable] = 0 - self.model.objective = reaction_objective - reaction_objective.set_linear_coefficients(obj_coef) + def set_base_objective(self,objective,minobjective): + """Sets the base objective for the model + Parameters + ---------- + objective : string | model.objective + ID of reaction to be maximized as the objective or model objective object + minobjective : float + Minimal objective value to be used + """ + #Setting the objective based on the objective argument + if isinstance(objective, str): + self.model.objective = self.model.reactions.get_by_id(objective).flux_expression + self.model.objective.direction = "max" + else: + self.model.objective = objective + #Setting original objective field + self.original_objective = self.model.objective + #Setting minimal objective constraint + self.pkgmgr.getpkg("ObjConstPkg").clear() + if minobjective: + if self.model.objective.direction == "max": + self.pkgmgr.getpkg("ObjConstPkg").build_package( + minobjective, None + ) + else: + self.pkgmgr.getpkg("ObjConstPkg").build_package( + None, minobjective + ) def extend_model_with_model_for_gapfilling(self, source_model, index): new_metabolites = {} @@ -550,6 +343,11 @@ def extend_model_with_model_for_gapfilling(self, source_model, index): if re.search("(.+)_([a-z])\d+$", modelreaction.id) != None: m = re.search("(.+)_([a-z])\d+$", modelreaction.id) if m[1] not in self.parameters["blacklist"]: + if m[1] in base_blacklist: + if base_blacklist[m[1]] == ">" or base_blacklist[m[1]] == "=": + cobra_reaction.upper_bound = 0 + if base_blacklist[m[1]] == "<" or base_blacklist[m[1]] == "=": + cobra_reaction.lower_bound = 0 cobra_reaction = modelreaction.copy() cobra_reaction.id = groups[1] + "_" + groups[2] + index if ( @@ -672,13 +470,24 @@ def extend_model_with_template_for_gapfilling(self, template, index): new_exchange, new_demand = self.extend_model_with_template_metabolites( template, index ) - + for template_reaction in template.reactions: if template_reaction.reference_id in self.parameters["blacklist"]: continue cobra_reaction = self.convert_template_reaction( template_reaction, index, template, 1 ) # TODO: move function out + if template_reaction.reference_id in base_blacklist: + if ( + base_blacklist[template_reaction.reference_id] == ">" + or base_blacklist[template_reaction.reference_id] == "=" + ): + cobra_reaction.upper_bound = 0 + if ( + base_blacklist[template_reaction.reference_id] == "<" + or base_blacklist[template_reaction.reference_id] == "=" + ): + cobra_reaction.lower_bound = 0 new_penalties[cobra_reaction.id] = dict() if ( cobra_reaction.id not in self.model.reactions @@ -718,7 +527,7 @@ def extend_model_with_template_for_gapfilling(self, template, index): self.parameters["default_excretion"], ) for ex in exchanges: - new_penalties[ex.id] = {"added": 1, "reverse": 1, "forward": 1} + new_penalties[ex.id] = {"added": 1, "reverse": self.parameters["default_exchange_penalty"], "forward": self.parameters["default_exchange_penalty"]} # Only run this on new demands so we don't readd for all exchanges exchanges = self.modelutl.add_exchanges_for_metabolites( @@ -728,7 +537,7 @@ def extend_model_with_template_for_gapfilling(self, template, index): "DM_", ) for ex in exchanges: - new_penalties[ex.id] = {"added": 1, "reverse": 1, "forward": 1} + new_penalties[ex.id] = {"added": 1, 
"reverse": self.parameters["default_exchange_penalty"], "forward": self.parameters["default_exchange_penalty"]} # Adding all new reactions to the model at once (much faster than one at a time) self.model.add_reactions(new_reactions.values()) @@ -802,6 +611,19 @@ def convert_template_reaction( return cobra_reaction + def set_media(self, media): + if self.parameters["base_media"]: + reaction_exceptions = [] + for mediacpd in media.mediacompounds: + if not self.parameters["base_media"].find_mediacpd(mediacpd.id): + ex_hash = mediacpd.get_mdl_exchange_hash(self.modelutl) + for mdlcpd in ex_hash: + reaction_exceptions.append(ex_hash[mdlcpd]) + self.modelutl.pkgmgr.getpkg("ElementUptakePkg").build_package( + {self.parameters["base_media_target_element"]:1}, exception_reactions=reaction_exceptions + ) + self.modelutl.pkgmgr.getpkg("KBaseMediaPkg").build_package(media, self.parameters["default_uptake"], self.parameters["default_excretion"]) + def binary_check_gapfilling_solution(self, solution=None, flux_values=None): if solution is None: solution = self.compute_gapfilled_solution() @@ -843,12 +665,12 @@ def knockout_gf_reactions_outside_solution(self, solution=None, flux_values=None if rxnobj.id in self.gapfilling_penalties: if ( "reverse" in self.gapfilling_penalties[rxnobj.id] - and flux_values[rxnobj.id]["reverse"] <= Zero + and flux_values[rxnobj.id]["reverse"] <= zero_threshold ): rxnobj.lower_bound = 0 if ( "forward" in self.gapfilling_penalties[rxnobj.id] - and flux_values[rxnobj.id]["forward"] <= Zero + and flux_values[rxnobj.id]["forward"] <= zero_threshold ): rxnobj.upper_bound = 0 rxnobj.update_variable_bounds() @@ -869,7 +691,7 @@ def run_test_conditions(self, condition_list, solution=None, max_iterations=10): with self.model: # Setting all gapfilled reactions not in the solution to zero self.knockout_gf_reactions_outside_solution(solution) - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = 0 + self.reset_objective_minimum(0,False) for condition in condition_list: condition["change"] = True filtered_list = self.modelutl.reaction_expansion_test( @@ -879,7 +701,7 @@ def run_test_conditions(self, condition_list, solution=None, max_iterations=10): condition["change"] = False if len(filtered_list) > 0: if max_iterations > 0: - print("Gapfilling test failed " + str(11 - max_iterations)) + logger.warning("Gapfilling test failed " + str(11 - max_iterations)) # Forcing filtered reactions to zero for item in filtered_list: if item[1] == ">": @@ -887,9 +709,7 @@ def run_test_conditions(self, condition_list, solution=None, max_iterations=10): else: self.model.reactions.get_by_id(item[0].id).lower_bound = 0 # Restoring lower bound on biomass constraint - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"][ - "1" - ].lb = self.parameters["minimum_obj"] + self.reset_objective_minimum(self.parameters["minimum_obj"]) # Reoptimizing self.model.optimize() return self.run_test_conditions( @@ -898,43 +718,121 @@ def run_test_conditions(self, condition_list, solution=None, max_iterations=10): return None return solution - def filter_database_based_on_tests(self, test_conditions): - filetered_list = [] - with self.model: - rxnlist = [] - for reaction in self.model.reactions: - if reaction.id in self.gapfilling_penalties: - if "reverse" in self.gapfilling_penalties[reaction.id]: - rxnlist.append([reaction, "<"]) - if "forward" in self.gapfilling_penalties[reaction.id]: - rxnlist.append([reaction, ">"]) - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = 0 - filtered_list = 
self.modelutl.reaction_expansion_test( - rxnlist, test_conditions - ) + def test_gapfill_database(self,active_reactions=None): + if active_reactions is None: + active_reactions = [] + self.reset_objective_minimum(0,False) + self.model.objective = self.original_objective + self.test_solution = self.model.optimize() + logger.info( + "Objective with gapfill database:" + + str(self.test_solution.objective_value) + + "; min objective:" + + str(self.parameters["minimum_obj"]) + ) + self.reset_objective_minimum(self.parameters["minimum_obj"]) + self.model.objective = self.parameters["gfobj"] + if self.test_solution.objective_value < self.parameters["minimum_obj"] or self.test_solution.status == 'infeasible': + return False + #Running pFBA to determine active reactions for nonzero objective + solution = pfba(self.model) + for rxn in self.model.reactions: + if solution.fluxes[rxn.id] > 0: + active_reactions.append([rxn.id,">"]) + return True + + def reset_objective_minimum(self, min_objective,reset_params=True): + if reset_params and min_objective != 0: + self.parameters["minimum_obj"] = min_objective + if "1" not in self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]: + self.pkgmgr.getpkg("ObjConstPkg").build_package(min_objective, None) + if min_objective == 0: + if self.parameters["minimum_obj"] > 0: + self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = 0 + if self.parameters["minimum_obj"] < 0: + self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].ub = 0 + else: + if min_objective > 0: + self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = min_objective + if min_objective < 0: + self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].ub = min_objective
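The `reset_objective_minimum` helper above is the single switch for the `ObjConstPkg` floor on the base objective. A hedged sketch of the intended call pattern (the `gapfiller` handle is illustrative):

```python
# Relax the minimum-objective constraint without forgetting the stored floor,
# run tests, then restore it from parameters["minimum_obj"].
gapfiller.reset_objective_minimum(0, False)  # reset_params=False preserves parameters["minimum_obj"]
# ... run reaction expansion tests with the floor relaxed ...
gapfiller.reset_objective_minimum(gapfiller.parameters["minimum_obj"])
```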
+ + def filter_database_based_on_tests(self,test_conditions,growth_conditions=[],base_filter=None,base_target="rxn00062_c0",base_filter_only=False,all_noncore=True,active_reaction_sets=[]): + #Saving the current media + current_media = self.current_media() + #Clearing element uptake constraints + self.pkgmgr.getpkg("ElementUptakePkg").clear() + # Setting the minimal growth constraint to zero + self.reset_objective_minimum(0,False) + # Applying base filter + base_filter_list = [] + if base_filter != None: + for media_id in base_filter: + if base_target in base_filter[media_id]: + for threshold in base_filter[media_id][base_target]: + for rxn_id in base_filter[media_id][base_target][threshold]: + for direction in base_filter[media_id][base_target][threshold][rxn_id]: + if rxn_id in self.model.reactions: + rxnobj = self.model.reactions.get_by_id(rxn_id) + base_filter_list.append([rxnobj,direction]) + if direction == ">": + rxnobj.upper_bound = 0 + else: + rxnobj.lower_bound = 0 + # Filtering the database of any reactions that violate the specified tests + filtered_list = [] + if not base_filter_only: + with self.model: + rxnlist = [] + rxndict = {} + for reaction in self.model.reactions: + if reaction.id in self.gapfilling_penalties: + rxndict[reaction.id] = 1 + if "reverse" in self.gapfilling_penalties[reaction.id]: + rxnlist.append([reaction, "<"]) + if "forward" in self.gapfilling_penalties[reaction.id]: + rxnlist.append([reaction, ">"]) + elif all_noncore and not self.modelutl.is_core(reaction): + rxndict[reaction.id] = 1 + if reaction.lower_bound < 0: + rxnlist.append([reaction, "<"]) + if reaction.upper_bound > 0: + rxnlist.append([reaction, ">"]) + logger.debug("Full model: " + str(len(self.modelutl.model.reactions))) + logger.debug("Gapfilling count: " + str(len(self.gapfilling_penalties))) + logger.debug("Reaction list: " + str(len(rxndict))) + filtered_list = self.modelutl.reaction_expansion_test( + rxnlist, test_conditions,active_reaction_sets=active_reaction_sets + )#,positive_growth=growth_conditions + #Adding base filter reactions to model + if base_filter != None: + gf_filter_att = self.modelutl.get_attributes("gf_filter", {}) + for media_id in base_filter: + if media_id not in gf_filter_att: + gf_filter_att[media_id] = {} + if base_target in base_filter[media_id]: + if base_target not in gf_filter_att[media_id]: + gf_filter_att[media_id][base_target] = {} + for threshold in base_filter[media_id][base_target]: + if threshold not in gf_filter_att[media_id][base_target]: + gf_filter_att[media_id][base_target][threshold] = {} + for rxn_id in base_filter[media_id][base_target][threshold]: + if rxn_id not in gf_filter_att[media_id][base_target][threshold]: + gf_filter_att[media_id][base_target][threshold][rxn_id] = {} + for direction in base_filter[media_id][base_target][threshold][rxn_id]: + gf_filter_att[media_id][base_target][threshold][rxn_id][direction] = base_filter[media_id][base_target][threshold][rxn_id][direction] # Now constraining filtered reactions to zero for item in filtered_list: - logger.debug("Filtering:", item[0].id, item[1]) + logger.debug("Filtering:" + item[0].id + item[1]) if item[1] == ">": self.model.reactions.get_by_id(item[0].id).upper_bound = 0 else: self.model.reactions.get_by_id(item[0].id).lower_bound = 0 # Now testing if the gapfilling minimum objective can still be achieved - gfobj = self.model.objective - self.model.objective = self.parameters["origobj"] - solution = self.model.optimize() - # Restoring the minimum objective constraint - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = self.parameters[ - "minimum_obj" - ] - print( - "Objective after filtering:", - solution.objective_value, - "; min objective:", - self.parameters["minimum_obj"], - ) - if solution.objective_value < self.parameters["minimum_obj"]: + if not self.test_gapfill_database(): # Now we need to restore a minimal set of filtered reactions such that we permit the minimum objective to be reached + # Restoring the minimum objective constraint + self.reset_objective_minimum(self.parameters["minimum_obj"]) new_objective = self.model.problem.Objective(Zero, direction="min") filterobjcoef = dict() for item in filtered_list: @@ -945,7 +843,7 @@ def filter_database_based_on_tests(self, test_conditions): else: filterobjcoef[rxn.reverse_variable] = item[3] rxn.lower_bound = item[2] self.model.objective = new_objective new_objective.set_linear_coefficients(filterobjcoef) solution = self.model.optimize() @@ -967,22 +864,25 @@ def filter_database_based_on_tests(self, test_conditions): else: count += -1 rxn.lower_bound = 0 - print("Reactions unfiltered:", count) + logger.debug("Reactions unfiltered:" + str(count)) # Checking for model reactions that can be removed to enable all tests to pass - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"]["1"].lb = 0 + self.reset_objective_minimum(0,False) filtered_list = self.modelutl.reaction_expansion_test( - self.parameters["original_reactions"], test_conditions + self.parameters["original_reactions"], test_conditions,positive_growth=growth_conditions ) - for item in filtered_list: - logger.debug("Filtering:", item[0].id, item[1]) - if item[1] == ">": - self.model.reactions.get_by_id(item[0].id).upper_bound = 0 - else: -
self.model.reactions.get_by_id(item[0].id).lower_bound = 0 - self.pkgmgr.getpkg("ObjConstPkg").constraints["objc"][ - "1" - ].lb = self.parameters["minimum_obj"] - self.model.objective = gfobj + if filtered_list: + for item in filtered_list: + logger.debug("Filtering:" + item[0].id + item[1]) + if item[1] == ">": + self.model.reactions.get_by_id(item[0].id).upper_bound = 0 + else: + self.model.reactions.get_by_id(item[0].id).lower_bound = 0 + # Restoring gapfilling objective function and minimal objective constraint + self.reset_objective_minimum(self.parameters["minimum_obj"]) + self.model.objective = self.parameters["gfobj"] + if current_media: + self.set_media(current_media) + return True def compute_gapfilled_solution(self, flux_values=None): if flux_values is None: @@ -991,19 +891,23 @@ def compute_gapfilled_solution(self, flux_values=None): for reaction in self.model.reactions: if reaction.id in self.gapfilling_penalties: if ( - flux_values[reaction.id]["forward"] > Zero + flux_values[reaction.id]["forward"] > zero_threshold and "forward" in self.gapfilling_penalties[reaction.id] ): if "added" in self.gapfilling_penalties[reaction.id]: + logger.debug(f"New gapfilled reaction: {reaction.id} >") output["new"][reaction.id] = ">" else: + logger.debug(f"Reversed gapfilled reaction: {reaction.id} >") output["reversed"][reaction.id] = ">" elif ( - flux_values[reaction.id]["reverse"] > Zero + flux_values[reaction.id]["reverse"] > zero_threshold and "reverse" in self.gapfilling_penalties[reaction.id] ): if "added" in self.gapfilling_penalties[reaction.id]: + logger.debug(f"New gapfilled reaction: {reaction.id} <") output["new"][reaction.id] = "<" else: + logger.debug(f"Reversed gapfilled reaction: {reaction.id} <") output["reversed"][reaction.id] = "<" return output diff --git a/modelseedpy/fbapkg/kbasemediapkg.py b/modelseedpy/fbapkg/kbasemediapkg.py index 4dbf0779..92525b30 100644 --- a/modelseedpy/fbapkg/kbasemediapkg.py +++ b/modelseedpy/fbapkg/kbasemediapkg.py @@ -16,6 +16,7 @@ class KBaseMediaPkg(BaseFBAPkg): def __init__(self, model): BaseFBAPkg.__init__(self, model, "kbase media", {}, {}) + self.current_media = None def build_package( self, media_or_parameters, default_uptake=None, default_excretion=None @@ -40,14 +41,21 @@ def build_package( self.parameters["default_uptake"] = 0 if self.parameters["default_excretion"] is None: self.parameters["default_excretion"] = 100 - if self.parameters["media"] is None and self.parameters["default_uptake"] == 0: + self.current_media = self.parameters["media"] + if ( + self.parameters["media"] and self.parameters["media"].name == "Complete" + ) and self.parameters["default_uptake"] == 0: self.parameters["default_uptake"] = 100 # First initializing all exchanges to default uptake and excretion exchange_list = self.modelutl.exchange_list() for reaction in exchange_list: - reaction.lower_bound = -1 * self.parameters["default_uptake"] - reaction.upper_bound = self.parameters["default_excretion"] + if -1 * self.parameters["default_uptake"] > reaction.upper_bound: + reaction.upper_bound = self.parameters["default_excretion"] + reaction.lower_bound = -1 * self.parameters["default_uptake"] + else: + reaction.lower_bound = -1 * self.parameters["default_uptake"] + reaction.upper_bound = self.parameters["default_excretion"] # Now constraining exchanges for specific compounds specified in the media if self.parameters["media"]: diff --git a/modelseedpy/fbapkg/objconstpkg.py b/modelseedpy/fbapkg/objconstpkg.py index de1dc3b5..0963e190 100644 --- 
a/modelseedpy/fbapkg/objconstpkg.py +++ b/modelseedpy/fbapkg/objconstpkg.py @@ -11,12 +11,22 @@ def __init__(self, model): BaseFBAPkg.__init__(self, model, "objective constraint", {}, {"objc": "none"}) def build_package(self, lower_bound, upper_bound): - self.build_constraint(lower_bound, upper_bound) + return self.build_constraint(lower_bound, upper_bound) def build_constraint(self, lower_bound, upper_bound): coef = self.model.solver.objective.get_linear_coefficients( self.model.solver.objective.variables ) + #Check if the constraint already exists and if so, just updating bounds in place + for name in self.constraints["objc"]: + constraint = self.constraints["objc"][name] + existing_coef = constraint.get_linear_coefficients( + constraint.variables + ) + if coef == existing_coef: + constraint.lb = lower_bound + constraint.ub = upper_bound + return constraint return BaseFBAPkg.build_constraint( self, "objc", lower_bound, upper_bound, coef, None ) diff --git a/modelseedpy/fbapkg/objectivepkg.py b/modelseedpy/fbapkg/objectivepkg.py new file mode 100644 index 00000000..4adf908e --- /dev/null +++ b/modelseedpy/fbapkg/objectivepkg.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +import logging +from optlang.symbolics import Zero, add +from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg + +logger = logging.getLogger(__name__) +logger.setLevel( + logging.WARNING#INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO + +class ObjectiveTerm: + def __init__(self, variable, coefficient,direction=""): + self.coefficient = coefficient + self.variable = variable + self.direction = direction + + @staticmethod + def from_string(term_string): + coefficient = 1 + variable = None + direction = "" + #Checking for coefficient + if term_string[0:1] == "(": + array = term_string.split(")") + coefficient = float(array[0][1:]) + term_string = array[1] + #Checking for a +/- on term + if term_string[0:1] == "+" or term_string[0:1] == "-": + variable = term_string[1:] + direction = term_string[0:1] + else: + variable = term_string + direction = "" + return ObjectiveTerm(variable, coefficient, direction) + + def to_string(self): + return "("+str(self.coefficient)+")"+self.direction+self.variable + + +#Class for defining an objective function in a modelseedpy model.
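For reference, a hedged sketch of the term grammar `ObjectiveTerm.from_string` parses, and of how the `ObjectiveData` class defined next joins terms with `|` inside `MAX{...}`/`MIN{...}` (reaction ids are illustrative):

```python
# "(coefficient)" prefix is optional, as is the "+"/"-" direction flag.
term = ObjectiveTerm.from_string("(2.5)+bio1")
assert term.coefficient == 2.5 and term.direction == "+" and term.variable == "bio1"
assert term.to_string() == "(2.5)+bio1"

obj = ObjectiveData.from_string("MAX{(1.0)+bio1|(0.5)-rxn00001_c0}")
assert obj.sign == 1 and len(obj.terms) == 2
assert obj.to_string() == "MAX{(1.0)+bio1|(0.5)-rxn00001_c0}"
```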
+class ObjectiveData: + def __init__(self, terms, sign=1): + self.sign = sign + self.terms = terms + + @staticmethod + def from_string(objective_string): + sign = 1 + terms = [] + if objective_string[0:3] == "MAX": + objective_string = objective_string[4:-1]#Clearing out the directionality MAX{} + elif objective_string[0:3] == "MIN": + sign = -1 + objective_string = objective_string[4:-1]#Clearing out the directionality MIN{} + term_strings = objective_string.split("|") + for term_string in term_strings: + term = ObjectiveTerm.from_string(term_string) + terms.append(term) + return ObjectiveData(terms, sign) + + def to_string(self): + objective_string = "" + if self.sign == 1: + objective_string += "MAX{" + else: + objective_string += "MIN{" + for term in self.terms: + objective_string += term.to_string()+"|" + objective_string = objective_string[:-1] + "}" + return objective_string + + def to_cobrapy_objective(self, model): + #Creating empty objective + objective = model.problem.Objective(Zero, direction="max") + #Parsing the terms + coefficients = {} + for term in self.terms: + if term.variable in model.reactions: + coef = term.coefficient * self.sign + rxnobj = model.reactions.get_by_id(term.variable) + if term.direction == "+": + coefficients[rxnobj.forward_variable] = coef + elif term.direction == "-": + coefficients[rxnobj.reverse_variable] = coef + else: + coefficients[rxnobj.forward_variable] = coef + coefficients[rxnobj.reverse_variable] = -1*coef + else: + logger.warning("Reaction "+term.variable+" not found in model") + model.objective = objective + objective.set_linear_coefficients(coefficients) + return objective + +# Base class for FBA packages +class ObjectivePkg(BaseFBAPkg): + def __init__(self, model): + BaseFBAPkg.__init__(self, model, "objective builder", {}, {}) + self.original_model_objective = None + self.objective_name = None + self.objective_data = None + self.objective_data_cache = {} + + def build_package(self,objective_or_string,objective_name=None,set_objective=True): + #Caching the current objective + self.original_model_objective = self.model.objective + #check if input is a string or an ObjectiveData object + if isinstance(objective_or_string, str): + self.objective_data = ObjectiveData.from_string(objective_or_string) + elif isinstance(objective_or_string, ObjectiveData): + self.objective_data = objective_or_string + else: + raise TypeError("Input must be a string or an ObjectiveData object") + #Setting default objective name if not provided + self.objective_name = objective_name + if objective_name == None: + self.objective_name = self.objective_data.to_string() + #Caching objective with name + self.objective_data_cache[self.objective_name] = self.objective_data + #Creating the objective in the model + if set_objective: + self.objective_data_cache[self.objective_name].to_cobrapy_objective(self.model) + return objective_name + + def restore_objective(self,name): + self.original_model_objective = self.model.objective + if name in self.objective_data_cache: + self.model.objective = self.objective_data_cache[name].to_cobrapy_objective(self.model) + else: + logger.warning("Objective "+name+" not found in cache") \ No newline at end of file diff --git a/modelseedpy/fbapkg/problemreplicationpkg.py b/modelseedpy/fbapkg/problemreplicationpkg.py index 062abc33..71e81244 100644 --- a/modelseedpy/fbapkg/problemreplicationpkg.py +++ b/modelseedpy/fbapkg/problemreplicationpkg.py @@ -21,12 +21,13 @@ def build_package(self, parameters): # First loading shared variables into 
a hash shared_var_hash = {} for pkg in self.parameters["shared_variable_packages"]: + fbapkg = self.modelutl.pkgmgr.getpkg(pkg) for type in self.parameters["shared_variable_packages"][pkg]: - if type in pkg.variables: - for objid in pkg.variables[type]: + if type in fbapkg.variables: + for objid in fbapkg.variables[type]: shared_var_hash[ - pkg.variables[type][objid].name - ] = pkg.variables[type][objid] + fbapkg.variables[type][objid].name + ] = fbapkg.variables[type][objid] # Now copying over variables and constraints from other models and replacing shared variables count = 0 for othermdl in self.parameters["models"]: @@ -36,8 +37,9 @@ def build_package(self, parameters): new_var_hash = {} for var in othermdl.variables: if var.name not in shared_var_hash: - newvar = Variable.clone(var) - newvar.name = var.name + "." + str(count) + newvar = self.model.problem.Variable( + var.name + "." + str(count), lb=var.lb, ub=var.ub, type=var.type + ) self.variables[str(count)][var.name] = newvar new_var_hash[var.name] = newvar newobj.append(newvar) diff --git a/modelseedpy/fbapkg/proteomefittingpkg.py b/modelseedpy/fbapkg/proteomefittingpkg.py index 469efc08..3aedacb5 100644 --- a/modelseedpy/fbapkg/proteomefittingpkg.py +++ b/modelseedpy/fbapkg/proteomefittingpkg.py @@ -7,7 +7,8 @@ from optlang.symbolics import Zero, add from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg from modelseedpy.core.fbahelper import FBAHelper -from modelseedpy.multiomics.msexpression import MSExpression, GENOME, MODEL, COLUMN_NORM +from modelseedpy.multiomics.msexpression import MSExpression +from modelseedpy.core.msgenome import MSGenome # Options for default behavior LOWEST = 10 @@ -45,7 +45,7 @@ def build_package(self, parameters): ) objvars = [] # Converting genome proteome to reaction proteome if necessary - if self.parameters["proteome"].type == GENOME: + if isinstance(self.parameters["proteome"].object, MSGenome): self.parameters["proteome"] = self.parameters[ "proteome" ].build_reaction_expression( @@ -123,7 +123,7 @@ def build_constraint(self, object, type): # kvfit(i) = kapp(i)*ProtCoef*Prot(i) - v(i) # Pulling expression value for selected condition and reaction expval = self.parameters["proteome"].get_value( - object.id, self.parameters["condition"], COLUMN_NORM + object.id, self.parameters["condition"] ) if expval is None and self.parameters["default_expression"] is not None: if self.parameters["default_expression"] == LOWEST: diff --git a/modelseedpy/fbapkg/reactionactivationpkg.py b/modelseedpy/fbapkg/reactionactivationpkg.py new file mode 100644 index 00000000..f43bac06 --- /dev/null +++ b/modelseedpy/fbapkg/reactionactivationpkg.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +import logging + +logger = logging.getLogger(__name__) +from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg +from modelseedpy.core.fbahelper import FBAHelper + +# Base class for FBA packages +class ReactionActivationPkg(BaseFBAPkg): + def __init__(self, model): + BaseFBAPkg.__init__( + self, + model, + "ReactionActivation", + {"fra": "reaction", "rra": "reaction"}, + { + "fra": "reaction", + "rra": "reaction" + } + ) + + def build_package(self, rxn_filter=None,max_value=0.001): + self.pkgmgr.getpkg("RevBinPkg").build_package(filter=rxn_filter) + for rxn in self.model.reactions: + # Checking that reaction passes input filter if one is provided + if rxn_filter == None: + self.build_variable(rxn,max_value) + self.build_constraint(rxn) + elif rxn.id in rxn_filter: + self.build_variable(rxn,max_value) + self.build_constraint(rxn) + + def
build_variable(self, cobra_obj,max_value): + variable = BaseFBAPkg.build_variable(self, "fra", 0,max_value, "continuous", cobra_obj) + variable = BaseFBAPkg.build_variable(self, "rra", 0,max_value, "continuous", cobra_obj) + return variable + + def build_constraint(self, cobra_obj): + constraint = None + if cobra_obj.id not in self.constraints["fra"]: + constraint = BaseFBAPkg.build_constraint( + self, + "fra", + None, + 0, + { + self.variables["fra"][cobra_obj.id]: 1, + cobra_obj.forward_variable: -1, + }, + cobra_obj, + ) + if cobra_obj.id not in self.constraints["rra"]: + constraint = BaseFBAPkg.build_constraint( + self, + "rra", + None, + 0, + { + self.variables["rra"][cobra_obj.id]: 1, + cobra_obj.reverse_variable: -1 + }, + cobra_obj, + ) + return constraint \ No newline at end of file diff --git a/modelseedpy/fbapkg/reactionusepkg.py b/modelseedpy/fbapkg/reactionusepkg.py index c68c9a44..f3e17bc9 100644 --- a/modelseedpy/fbapkg/reactionusepkg.py +++ b/modelseedpy/fbapkg/reactionusepkg.py @@ -1,10 +1,12 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import - import logging -from optlang.symbolics import Zero, add + +logger = logging.getLogger(__name__) +from optlang.symbolics import Zero, add # !!! add is never used from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg +from modelseedpy.core.fbahelper import FBAHelper # Base class for FBA packages class ReactionUsePkg(BaseFBAPkg): @@ -22,62 +24,70 @@ def __init__(self, model): }, ) - def build_package(self, filter=None, reversibility=0): - for reaction in self.model.reactions: + def build_package(self, rxn_filter=None, reversibility=False): + for rxn in self.model.reactions: # Checking that reaction passes input filter if one is provided - if filter == None: - self.build_variable(reaction, "=") - self.build_constraint(reaction, reversibility) - elif reaction.id in filter: - self.build_variable(reaction, filter[reaction.id]) - self.build_constraint(reaction, reversibility) + if rxn_filter == None: + self.build_variable(rxn, "=") + self.build_constraint(rxn, reversibility) + elif rxn.id in rxn_filter: + self.build_variable(rxn, rxn_filter[rxn.id]) + self.build_constraint(rxn, reversibility) - def build_variable(self, object, direction): + def build_variable(self, cobra_obj, direction): variable = None if ( (direction == ">" or direction == "=") - and object.upper_bound > 0 - and object.id not in self.variables["fu"] + and cobra_obj.upper_bound > 0 + and cobra_obj.id not in self.variables["fu"] ): - variable = BaseFBAPkg.build_variable(self, "fu", 0, 1, "binary", object) + variable = BaseFBAPkg.build_variable(self, "fu", 0, 1, "binary", cobra_obj) if ( (direction == "<" or direction == "=") - and object.lower_bound < 0 - and object.id not in self.variables["ru"] + and cobra_obj.lower_bound < 0 + and cobra_obj.id not in self.variables["ru"] ): - variable = BaseFBAPkg.build_variable(self, "ru", 0, 1, "binary", object) + variable = BaseFBAPkg.build_variable(self, "ru", 0, 1, "binary", cobra_obj) return variable - def build_constraint(self, object, reversibility): + def build_constraint(self, cobra_obj, reversibility): constraint = None if ( - object.id not in self.constraints["fu"] - and object.id in self.variables["fu"] + cobra_obj.id not in self.constraints["fu"] + and cobra_obj.id in self.variables["fu"] ): constraint = BaseFBAPkg.build_constraint( self, "fu", 0, None, - {self.variables["fu"][object.id]: 1000, object.forward_variable: -1}, - object, + { + self.variables["fu"][cobra_obj.id]: 1000, + cobra_obj.forward_variable: 
-1, + }, + cobra_obj, ) if ( - object.id not in self.constraints["ru"] - and object.id in self.variables["ru"] + cobra_obj.id not in self.constraints["ru"] + and cobra_obj.id in self.variables["ru"] ): constraint = BaseFBAPkg.build_constraint( self, "ru", 0, None, - {self.variables["ru"][object.id]: 1000, object.reverse_variable: -1}, - object, + { + self.variables["ru"][cobra_obj.id]: 1000, + cobra_obj.reverse_variable: -1, + }, + cobra_obj, ) - if ( - reversibility == 1 - and object.id in self.variables["ru"] - and object.id in self.variables["fu"] + if all( + [ + reversibility, + cobra_obj.id in self.variables["ru"], + cobra_obj.id in self.variables["fu"], + ] ): constraint = BaseFBAPkg.build_constraint( self, @@ -85,24 +95,25 @@ def build_constraint(self, object, reversibility): None, 1, { - self.variables["ru"][object.id]: 1, - self.variables["fu"][object.id]: 1, + self.variables["ru"][cobra_obj.id]: 1, + self.variables["fu"][cobra_obj.id]: 1, }, - object, + cobra_obj, ) return constraint def build_exclusion_constraint(self, flux_values=None): - if flux_values == None: - flux_values = FBAHelper.compute_flux_values_from_variables(self.model) + flux_values = flux_values or FBAHelper.compute_flux_values_from_variables( + self.model + ) count = len(self.constraints["exclusion"]) solution_coef = {} solution_size = 0 - for rxnid in flux_values: - if flux_values[rxnid] > Zero: + for rxnid, flux in flux_values.items(): + if flux > Zero: solution_size += 1 solution_coef[self.variables["fu"][rxnid]] = 1 - elif flux_values[rxnid] < -1 * Zero: + elif flux < -1 * Zero: solution_size += 1 solution_coef[self.variables["ru"][rxnid]] = 1 if len(solution_coef) > 0:
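The `fu`/`ru` constraints in `ReactionUsePkg` above gate flux with a big-M term: `1000*fu - v_forward >= 0` forces forward flux to zero whenever the binary `fu` is zero. A hedged usage sketch (the `model_util` handle and reaction id are illustrative):

```python
pkg = model_util.pkgmgr.getpkg("ReactionUsePkg")
# Build use binaries for one reversible reaction; reversibility=True also adds
# the coupling constraint fu + ru <= 1 so only one direction can be active.
pkg.build_package(rxn_filter={"rxn00001_c0": "="}, reversibility=True)
```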
diff --git a/modelseedpy/fbapkg/referencefluxpkg.py b/modelseedpy/fbapkg/referencefluxpkg.py new file mode 100755 index 00000000..fe0a30a8 --- /dev/null +++ b/modelseedpy/fbapkg/referencefluxpkg.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +import logging + +logger = logging.getLogger(__name__) +from optlang.symbolics import Zero, add +from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg + +# Base class for FBA packages +class ReferenceFluxPkg(BaseFBAPkg): + def __init__(self, model): + BaseFBAPkg.__init__( + self, model, "reference flux", {"refv": "reaction","preferr":"reaction","nrefer":"reaction"}, {"refvc": "reaction"} + ) + + def build_package(self, parameters): + self.validate_parameters( + parameters, + [], + { + "coef": {}, + "fixed": False, + "set_objective": True, + "quadratic": True, + "objective_coef":{}, + "default_objective_coef": 1.0 + }, + ) + for rxnid in self.parameters["coef"]: + if rxnid in self.model.reactions: + rxnobj = self.model.reactions.get_by_id(rxnid) + self.build_variables(rxnobj) + self.build_constraints(rxnobj) + objvars = {} + if self.parameters["set_objective"]: + if self.parameters["fixed"]: + logger.warning("In fixed mode, no objective will be created") + else: + for rxnid in self.variables["preferr"]: + objcoef = self.parameters["default_objective_coef"] + if rxnid in self.parameters["objective_coef"]: + objcoef = self.parameters["objective_coef"][rxnid] + if self.parameters["quadratic"]: + objvars[self.variables["preferr"][rxnid]] = objcoef + else: + objvars[self.variables["preferr"][rxnid]] = objcoef + objvars[self.variables["nrefer"][rxnid]] = objcoef + self.model.objective = self.model.problem.Objective( + Zero, direction="min", sloppy=True + ) + self.model.objective.set_linear_coefficients(objvars) + + def build_variables(self, object,coef=None): + BaseFBAPkg.build_variable( + self, "refv", object.lower_bound, object.upper_bound, "continuous", object + ) + lower_bound = -1000 + if not self.parameters["quadratic"]: + BaseFBAPkg.build_variable( + self, "nrefer", 0, 1000, "continuous", object + ) + lower_bound = 0 + BaseFBAPkg.build_variable( + self, "preferr", lower_bound, 1000, "continuous", object + ) + + def build_constraints(self, object): + # Variable: preferr(i) - nrefer(i) = refv(i) - forward_flux(i) + reverse_flux(i) + # Fixed: refv(i) = forward_flux(i) - reverse_flux(i) + coef = { + self.variables["refv"][object.id]: 1, + object.forward_variable: -1, + object.reverse_variable: 1 + } + if not self.parameters["fixed"]: + coef[self.variables["preferr"][object.id]] = -1 + if not self.parameters["quadratic"]: + coef[self.variables["nrefer"][object.id]] = 1 + return BaseFBAPkg.build_constraint( + self, "refvc", 0, 0, coef, object + ) \ No newline at end of file diff --git a/modelseedpy/multiomics/msexpression.py b/modelseedpy/multiomics/msexpression.py index 02453e34..0df06d8c 100644 --- a/modelseedpy/multiomics/msexpression.py +++ b/modelseedpy/multiomics/msexpression.py @@ -1,26 +1,27 @@ # -*- coding: utf-8 -*- import logging +from typing import Optional, Union, TYPE_CHECKING +import pandas as pd +import numpy as np import re import copy from cobra.core.dictlist import DictList -from cobra.core.gene import Gene, ast2str, eval_gpr, parse_gpr +from cobra.core.gene import Gene, ast2str, eval_gpr, parse_gpr, GPR +from cobra import Solution from ast import And, BitAnd, BitOr, BoolOp, Expression, Name, NodeTransformer, Or from modelseedpy.core.msgenome import MSGenome, MSFeature - -# Types of expression data -GENOME = 10 -MODEL = 20 - -# Types of normalization -COLUMN_NORM = 10 +from modelseedpy.core.msmodelutl import MSModelUtil logger = logging.getLogger(__name__) +def compute_gene_score(expr, values, default, datatype): + # Handle tuple return from parse_gpr() in newer COBRApy versions + if isinstance(expr, tuple) and len(expr) == 2: + expr = expr[0] # Extract GPR
object from (GPR, frozenset) tuple -def compute_gene_score(expr, values, default): - if isinstance(expr, Expression): - return compute_gene_score(expr.body, values, default) + if isinstance(expr, (Expression, GPR)): + return compute_gene_score(expr.body, values, default, datatype) elif isinstance(expr, Name): if expr.id in values: return values[expr.id] @@ -29,17 +31,37 @@ def compute_gene_score(expr, values, default): elif isinstance(expr, BoolOp): op = expr.op if isinstance(op, Or): - total = 0 + best = None + total = None for subexpr in expr.values: - total += compute_gene_score(subexpr, values, default) + value = compute_gene_score(subexpr, values, default, datatype) + if value != None: + if datatype == "NormalizedRatios": + diff = abs(value - 1) + if best == None or diff > best: + best = diff + total = value + elif datatype == "RelativeAbundance" or datatype == "FPKM" or datatype == "TPM" or datatype == "AbsoluteAbundance": + if total == None: + total = 0 + total += value return total elif isinstance(op, And): - least = None + best = None + best_value = None for subexpr in expr.values: - value = compute_gene_score(subexpr, values, default) - if least == None or value < least: - least = value - return least + value = compute_gene_score(subexpr, values, default, datatype) + if value != None: + if datatype == "NormalizedRatios": + diff = abs(value - 1) + if best == None or diff > best: + best = diff + best_value = value + elif datatype == "RelativeAbundance" or datatype == "FPKM" or datatype == "TPM" or datatype == "AbsoluteAbundance": + if best == None or value < best: + best = value + best_value = value + return best_value else: raise TypeError("unsupported operation " + op.__class__.__name__) elif expr is None: @@ -47,111 +69,483 @@ def compute_gene_score(expr, values, default): else: raise TypeError("unsupported operation " + repr(expr)) - class MSCondition: - def __init__(self, id): + def __init__(self, id, parent): self.id = id - self.column_sum = None - self.feature_count = None - self.lowest = None + self.parent = parent + + def value_at_zscore(self, zscore: float) -> Optional[float]: + """Calculate the value at a given z-score for this condition. + + Args: + zscore: The z-score threshold + + Returns: + The value at the specified z-score, or None if no data + """ + if self.id not in self.parent._data.columns: + return None + + values = self.parent._data[self.id].dropna() + if len(values) == 0: + return None + + mean = values.mean() + std_dev = values.std() + return mean + (zscore * std_dev) + + def lowest_value(self) -> Optional[float]: + """Get the minimum value for this condition. + Returns: + The minimum value, or None if no data + """ + if self.id not in self.parent._data.columns: + return None + + values = self.parent._data[self.id].dropna() + if len(values) == 0: + return None + + return float(values.min()) + + def highest_value(self) -> Optional[float]: + """Get the maximum value for this condition. + + Returns: + The maximum value, or None if no data + """ + if self.id not in self.parent._data.columns: + return None + + values = self.parent._data[self.id].dropna() + if len(values) == 0: + return None + + return float(values.max()) + + def average_value(self) -> Optional[float]: + """Get the mean value for this condition. 
+ + Returns: + The mean value, or None if no data + """ + if self.id not in self.parent._data.columns: + return None + + values = self.parent._data[self.id].dropna() + if len(values) == 0: + return None + + return float(values.mean()) + + def sum_value(self) -> float: + """Get the sum of all values for this condition. + + Returns: + The sum of all values (0.0 if no data) + """ + if self.id not in self.parent._data.columns: + return 0.0 + + return float(self.parent._data[self.id].sum(skipna=True)) class MSExpressionFeature: def __init__(self, feature, parent): self.id = feature.id self.feature = feature - self.values = {} self.parent = parent - def add_value(self, condition, value): - if condition in self.values: - condition.feature_count += -1 - condition.column_sum += -1 * value - logger.warning( - "Overwriting value " - + str(self.values[condition]) - + " with " - + str(value) - + " in feature " - + self.feature.id - ) - if condition.lowest is None or condition.lowest > value: - condition.lowest = value - condition.feature_count += 1 - condition.column_sum += value - self.values[condition] = value + def add_value(self, condition: Union[str, 'MSCondition'], value: float) -> None: + """Add a value for a specific condition. - def get_value(self, condition, normalization=None): + Args: + condition: MSCondition object or condition ID string + value: The expression value to store + """ + # Resolve condition to condition_id + if isinstance(condition, str): + condition_id = condition + else: + condition_id = condition.id + + # Ensure feature row exists in parent DataFrame + if self.id not in self.parent._data.index: + self.parent._data.loc[self.id, :] = np.nan + + # Ensure condition column exists in parent DataFrame + if condition_id not in self.parent._data.columns: + self.parent._data[condition_id] = np.nan + + # Set the value + self.parent._data.loc[self.id, condition_id] = value + + def get_value(self, condition: Union[str, 'MSCondition'], convert_to_relative_abundance: bool = False) -> Optional[float]: + """Get the expression value for a specific condition. + + Args: + condition: MSCondition object or condition ID string + convert_to_relative_abundance: If True, convert value to relative abundance + + Returns: + The expression value, or None if not found + """ + # Resolve condition to condition_id and condition object if isinstance(condition, str): if condition not in self.parent.conditions: logger.warning( "Condition " + condition + " not found in expression object!" 
) return None - condition = self.parent.conditions.get_by_id(condition) - if condition not in self.values: + condition_id = condition + condition_obj = self.parent.conditions.get_by_id(condition) + else: + condition_id = condition.id + condition_obj = condition + + # Check if value exists in DataFrame + if self.id not in self.parent._data.index or condition_id not in self.parent._data.columns: logger.info( - "Condition " + condition.id + " has no value in " + self.feature.id + "Condition " + condition_id + " has no value in " + self.feature.id ) return None - if normalization == COLUMN_NORM: - return self.values[condition] / condition.column_sum - return self.values[condition] + # Get value from DataFrame and convert NaN to None + value = self.parent._data.loc[self.id, condition_id] + + # Handle duplicate indices - if loc returns a Series, take the last value + if isinstance(value, pd.Series): + value = value.iloc[-1] + + if pd.isna(value): + logger.info( + "Condition " + condition_id + " has no value in " + self.feature.id + ) + return None + + # Apply relative abundance conversion if requested + if convert_to_relative_abundance: + if self.parent.type in ("AbsoluteAbundance", "FPKM", "TPM"): + value = value / condition_obj.sum_value() + elif self.parent.type == "Log2": + value = 2 ** (value - condition_obj.lowest_value()) / 2 ** (condition_obj.sum_value() - len(self.parent.features) * condition_obj.lowest_value()) + + return value
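A hedged sketch of what the conversion branch above computes for count-like data (ids illustrative):

```python
feature = expr.features.get_by_id("gene1")           # expr: an MSExpression (assumed)
raw = feature.get_value("condition1")
rel = feature.get_value("condition1", convert_to_relative_abundance=True)
# For AbsoluteAbundance/FPKM/TPM data this is the column-normalized value:
# rel == raw / expr.conditions.get_by_id("condition1").sum_value()
```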
class MSExpression: def __init__(self, type): - self.type = type + self.type = type # RelativeAbundance, AbsoluteAbundance, FPKM, TPM, Log2, NormalizedRatios self.object = None self.features = DictList() self.conditions = DictList() + self._data = pd.DataFrame() + self._data.index.name = 'feature_id' @staticmethod - def from_gene_feature_file(filename, genome=None, create_missing_features=False): - expression = MSExpression(GENOME) - if genome == None: + def from_msexpression(msexpression: 'MSExpression') -> 'MSExpression': + """Create a copy of an existing MSExpression object. + + Args: + msexpression: The MSExpression object to copy + + Returns: + A new MSExpression object with the same data + """ + new_expression = MSExpression(msexpression.type) + new_expression.object = msexpression.object + # Copy features + for feature in msexpression.features: + new_expression.features.append(MSExpressionFeature(feature.feature, new_expression)) + # Copy conditions + for condition in msexpression.conditions: + new_expression.conditions.append(MSCondition(condition.id, new_expression)) + # Copy data DataFrame + new_expression._data = msexpression._data.copy() + return new_expression + + @staticmethod + def from_dataframe( + df: pd.DataFrame, + genome_or_model: Union['MSGenome', 'Model'], + create_missing_features: bool = False, + ignore_columns: list = None, + description_column: Optional[str] = None, + id_column: Optional[str] = None, + id_translation: Optional[dict] = None, + type: str = "RelativeAbundance" + ) -> 'MSExpression': + """Create an MSExpression object from a pandas DataFrame. + + Args: + df: DataFrame with feature IDs and condition values + genome_or_model: MSGenome or Model object (if None, an empty MSGenome is created) + create_missing_features: If True, create features not in genome + ignore_columns: List of column names to ignore + description_column: Name of column containing descriptions + id_column: Name of column containing feature IDs (default: first column) + type: Expression data type (RelativeAbundance, AbsoluteAbundance, FPKM, TPM, Log2) + + Returns: + MSExpression object with data loaded from DataFrame + """ + if ignore_columns is None: + ignore_columns = [] + + expression = MSExpression(type) + if genome_or_model is None: expression.object = MSGenome() create_missing_features = True else: - expression.object = genome - data = "" - with open(filename, "r") as file: - data = file.read() - lines = data.split("\n") - conditions = None - for line in lines: - if conditions == None: - conditions = [] - headers = line.split("\t") - for i in range(1, len(headers)): - if headers[i] not in expression.conditions: - conditions.append(MSCondition(headers[i])) - expression.conditions.append(conditions[i - 1]) - else: - conditions.append(self.conditions.get_by_id(headers[i])) - conditions[i - 1].column_sum = 0 - conditions[i - 1].feature_count = 0 + expression.object = genome_or_model + + # Identify columns + headers = list(df.columns) + if id_column is None: + id_column = headers[0] + logger.debug(f"Using id column {id_column}; headers: {headers}") + + # Identify condition columns + conditions = [] + description_present = description_column is not None and description_column in headers + + for header in headers: + if header == id_column: + continue + elif header == description_column: + continue + elif header not in ignore_columns: + conditions.append(header) + if header not in expression.conditions: + expression.conditions.append(MSCondition(header, expression)) + # Initialize metadata attributes + expression.conditions.get_by_id(header).column_sum = 0 + expression.conditions.get_by_id(header).feature_count = 0 + + # Add features to the expression object + valid_feature_ids = [] + for index, row in df.iterrows(): + gene_id = row[id_column] + if id_translation is not None and gene_id in id_translation: + gene_id = id_translation[gene_id] + description = None + if description_present: + description = row[description_column] + protfeature = expression.add_feature( + gene_id, create_missing_features, description=description + ) + if protfeature is not None: + valid_feature_ids.append(protfeature.id) + + # Bulk load data into DataFrame + if len(valid_feature_ids) > 0 and len(conditions) > 0: + # Apply ID translation to the dataframe's ID column if provided + if id_translation is not None: + # Create a translated version of the ID column for filtering and indexing + df_translated = df.copy() + df_translated['_translated_id'] = df_translated[id_column].map( + lambda x: id_translation.get(x, x) + ) + # Filter using translated IDs + data_df = df_translated[df_translated['_translated_id'].isin(valid_feature_ids)].copy() + # Set index using translated IDs + data_df = data_df.set_index('_translated_id') else: - array = line.split("\t") - protfeature = expression.add_feature(array[0], create_missing_features) - if protfeature != None: - for i in range(1, len(array)): - protfeature.add_value(conditions[i - 1], float(array[i])) + # Extract numeric data columns without translation + data_df = df[df[id_column].isin(valid_feature_ids)].copy() + data_df = data_df.set_index(id_column) + data_df = data_df[conditions] + + # Convert to numeric, coercing
errors to NaN + for col in conditions: + data_df[col] = pd.to_numeric(data_df[col], errors='coerce') + + # Assign to expression._data + expression._data = data_df + expression._data.index.name = 'feature_id' return expression + + @staticmethod + def from_spreadsheet( + filename: str, + sheet_name: Union[str, int] = 0, + skiprows: int = 0, + genome_or_model: Union['MSGenome', 'Model'] = None, + create_missing_features: bool = True, + id_translation: Optional[dict] = None, + ignore_columns: list = None, + description_column: Optional[str] = None, + id_column: Optional[str] = None, + type: str = "RelativeAbundance" + ) -> 'MSExpression': + """Create an MSExpression object from an Excel spreadsheet. + + Args: + filename: Path to Excel file + sheet_name: Sheet name or index (default: 0) + skiprows: Number of rows to skip at start + genome_or_model: MSGenome or Model object (optional) + create_missing_features: If True, create features not in genome + ignore_columns: List of column names to ignore + description_column: Name of column containing descriptions + id_column: Name of column containing feature IDs (default: first column) + type: Expression data type (RelativeAbundance, AbsoluteAbundance, FPKM, TPM, Log2) + + Returns: + MSExpression object with data loaded from spreadsheet + """ + df = pd.read_excel(filename, sheet_name=sheet_name, skiprows=skiprows) + return MSExpression.from_dataframe( + df, + genome_or_model=genome_or_model, + create_missing_features=create_missing_features, + ignore_columns=ignore_columns, + description_column=description_column, + id_column=id_column, + id_translation=id_translation, + type=type + ) + + @staticmethod + def from_gene_feature_file( + filename: str, + genome: Optional['MSGenome'] = None, + create_missing_features: bool = False, + ignore_columns: list = None, + description_column: Optional[str] = None, + sep: str = "\t", + id_column: Optional[str] = None, + type: str = "RelativeAbundance" + ) -> 'MSExpression': + """Create an MSExpression object from a delimited text file. + + Args: + filename: Path to delimited file + genome: MSGenome object (optional) + create_missing_features: If True, create features not in genome + ignore_columns: List of column names to ignore + description_column: Name of column containing descriptions + sep: Field delimiter (default: tab) + id_column: Name of column containing feature IDs (default: first column) + type: Expression data type (RelativeAbundance, AbsoluteAbundance, FPKM, TPM, Log2) + + Returns: + MSExpression object with data loaded from file + """ + df = pd.read_csv(filename, sep=sep) + return MSExpression.from_dataframe( + df, + genome_or_model=genome, + create_missing_features=create_missing_features, + ignore_columns=ignore_columns, + description_column=description_column, + id_column=id_column, + type=type + ) + + @staticmethod + def load_from_dict( + data_dict: dict, + genome_or_model: Union['MSGenome', 'Model'], + value_type: str, + create_missing_features: bool = False + ) -> 'MSExpression': + """Create an MSExpression object from a dictionary. + + The dictionary should have feature IDs as keys and nested dictionaries + mapping condition IDs to expression values as values.
+ + Example: + data = { + "gene1": { + "condition1": 10.5, + "condition2": 20.3 + }, + "gene2": { + "condition1": 8.2, + "condition2": 15.7 + } + } + expr = MSExpression.load_from_dict( + data, + genome_or_model=genome, + value_type="Log2" + ) + + Args: + data_dict: Dictionary with feature IDs as keys and condition-value + dictionaries as values + genome_or_model: MSGenome object (for gene expression) or Model + object (for reaction expression). Required. + value_type: Expression data type (RelativeAbundance, AbsoluteAbundance, + FPKM, TPM, Log2, NormalizedRatios). Required. + create_missing_features: If True, create features not in genome/model + + Returns: + MSExpression object with data loaded from dictionary + """ + # Create expression object + expression = MSExpression(value_type) + expression.object = genome_or_model + + # Convert dictionary to DataFrame + # The dictionary format is: {feature_id: {condition_id: value, ...}, ...} + if not data_dict: + return expression + + # Convert to DataFrame with feature IDs as index and conditions as columns + data_df = pd.DataFrame.from_dict(data_dict, orient='index') - def add_feature(self, id, create_gene_if_missing=False): + if 'Feature ID' not in data_df.columns: + # If 'Feature ID' is the index, reset it + data_df = data_df.reset_index() + if 'index' in data_df.columns: + data_df = data_df.rename(columns={'index': 'Feature ID'}) + + return MSExpression.from_dataframe( + df=data_df, + genome_or_model=genome_or_model, + create_missing_features=create_missing_features, + type=value_type, + id_column='Feature ID' + ) + + def add_feature( + self, + id: str, + create_gene_if_missing: bool = False, + description: Optional[str] = None + ) -> Optional['MSExpressionFeature']: + """Add a feature to the expression object. + + Args: + id: Feature ID + create_gene_if_missing: If True, create the gene in genome if missing + description: Optional feature description + + Returns: + MSExpressionFeature object, or None if feature not found + """ if id in self.features: return self.features.get_by_id(id) feature = None - if self.type == GENOME: - if self.object.search_for_gene(id) == None: + # Check if object is MSGenome (gene expression) or Model (reaction expression) + if isinstance(self.object, MSGenome): + if self.object.search_for_gene(id) is None: if create_gene_if_missing: self.object.features.append(MSFeature(id, "")) feature = self.object.search_for_gene(id) else: - if id in self.object.reactions: + # Assume it's a COBRApy Model with reactions + if hasattr(self.object, 'reactions') and id in self.object.reactions: feature = self.object.reactions.get_by_id(id) + elif hasattr(self.object.model, 'reactions') and id in self.object.model.reactions: + feature = self.object.model.reactions.get_by_id(id) + if feature is None: logger.warning( "Feature referred by expression " + id + " not found in genome object!"
+ + Args: + feature: MSExpressionFeature object or feature ID string + condition: MSCondition object or condition ID string + + Returns: + The expression value, or None if not found + """ if isinstance(feature, str): if feature not in self.features: logger.warning( @@ -170,46 +577,716 @@ def add_feature(self, id, create_gene_if_missing=False): ) return None feature = self.features.get_by_id(feature) - return feature.get_value(condition, normalization) + return feature.get_value(condition) - def build_reaction_expression(self, model, default): - if self.type == MODEL: - logger.critical( - "Cannot build a reaction expression from a model-based expression object!" - ) + def build_reaction_expression(self, model, default: Optional[float] = None) -> 'MSExpression': + """Build reaction-level expression from gene-level expression using GPR rules. + + Args: + model: COBRApy Model object + default: Default value for missing genes + + Returns: + MSExpression object with reaction-level expression data + """ # Creating the expression and features - rxnexpression = MSExpression(MODEL) + rxnexpression = MSExpression(self.type) rxnexpression.object = model for rxn in model.reactions: if len(rxn.genes) > 0: rxnexpression.add_feature(rxn.id) for condition in self.conditions: - rxnexpression.conditions.append(condition) - # Pulling the gene values from the current expression + newcondition = MSCondition(condition.id, rxnexpression) + rxnexpression.conditions.append(newcondition) + + # Pulling the gene values from the current expression using DataFrame values = {} - logger.warning("TESTING!") for gene in model.genes: - feature = self.object.search_for_gene(gene.id) - if feature == None: - logger.warning( - "Model gene " + gene.id + " not found in genome of expression" - ) - elif feature.id not in self.features: - logger.warning( - "Model gene " + gene.id + " in genome but not in expression" - ) + # First, try to find the gene directly in expression features + # This handles cases where expression was loaded without a genome + if gene.id in self.features: + feature = self.features.get_by_id(gene.id) else: + # Fallback: search through the genome object (supports aliases) + feature = self.object.search_for_gene(gene.id) + if feature is None: + logger.debug( + "Model gene " + gene.id + " not found in genome or expression" + ) + continue + if feature.id not in self.features: + logger.debug( + "Model gene " + gene.id + " in genome but not in expression" + ) + continue feature = self.features.get_by_id(feature.id) + + # Extract expression values for this feature + for condition in self.conditions: + if condition.id not in values: + values[condition.id] = {} + # Get value from DataFrame instead of feature.values dictionary + if feature.id in self._data.index and condition.id in self._data.columns: + value = self._data.loc[feature.id, condition.id] + if not pd.isna(value): + values[condition.id][gene.id] = value + # Computing the reaction level values for condition in rxnexpression.conditions: for feature in rxnexpression.features: - tree = parse_gpr(feature.feature.gene_reaction_rule)[0] + tree = GPR.from_string(str(feature.feature.gene_reaction_rule)) feature.add_value( - condition, compute_gene_score(tree, values[condition.id], default) + condition, compute_gene_score(tree, values[condition.id], default, self.type) + ) return rxnexpression
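A small worked example of the scoring rules `compute_gene_score` applies to abundance-like data during the conversion above: OR-joined genes are summed (isozymes add capacity) while AND-joined genes take the minimum (complex subunits are limiting). Gene ids are illustrative:

```python
from cobra.core.gene import GPR

values = {"g1": 5.0, "g2": 3.0, "g3": 2.0}
tree = GPR.from_string("(g1 or g2) and g3")
score = compute_gene_score(tree, values, None, "RelativeAbundance")
# Or branch: 5.0 + 3.0 = 8.0; And branch: min(8.0, 2.0) = 2.0
assert score == 2.0
```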
+ + def get_dataframe(self, reset_index: bool = False) -> pd.DataFrame: + """Get a DataFrame with expression data. + + Args: + reset_index: If True, move feature_id from index to column (default: False) + + Returns: + DataFrame with feature IDs as index (or column if reset_index=True) + and conditions as columns + """ + if reset_index: + return self._data.reset_index() + else: + return self._data.copy() + + def translate_data(self, target_type: str) -> 'MSExpression': + """Translate expression data to a different type. + + Args: + target_type: Target expression type (RelativeAbundance, AbsoluteAbundance, FPKM, TPM, Log2) + + Returns: + New MSExpression object with translated data + """ + # Create a copy of the current expression object + new_expression = MSExpression.from_msexpression(self) + new_expression.type = target_type + # Perform translation based on source and target types + for condition in self.conditions: + denominator = None + for feature in self.features: + value = feature.get_value(condition) + if value is not None: + if self.type == "AbsoluteAbundance" or self.type == "FPKM" or self.type == "TPM": + if target_type == "RelativeAbundance": + if condition.sum_value() > 0.01: + transformed_value = value / condition.sum_value() + else: + transformed_value = 0 + elif target_type == "NormalizedRatios": + if condition.highest_value() > 0.01: + transformed_value = value / condition.highest_value() + else: + transformed_value = 0 + else: + raise ValueError( + f"Translation from {self.type} to {target_type} not supported" + ) + new_expression._data.loc[feature.id, condition.id] = transformed_value + elif self.type == "Log2": + if target_type == "RelativeAbundance": + if denominator is None: + denominator = 0 + for ftr in self.features: + ftr_value = ftr.get_value(condition) + if ftr_value is not None: + denominator += 2 ** ftr_value + numerator = 2 ** value + if denominator > 0.01: + transformed_value = numerator / denominator + else: + transformed_value = 0 + else: + raise ValueError( + f"Translation from {self.type} to {target_type} not supported" + ) + new_expression._data.loc[feature.id, condition.id] = transformed_value + else: + raise ValueError( + f"Translation from {self.type} to {target_type} not supported" + ) + + return new_expression
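A hedged sketch of `translate_data` on absolute counts (the genome handle, ids, and values are illustrative):

```python
abs_expr = MSExpression.load_from_dict(
    {"gene1": {"c1": 10.0}, "gene2": {"c1": 30.0}},
    genome_or_model=my_genome,        # an MSGenome; gene1/gene2 created on the fly
    value_type="AbsoluteAbundance",
    create_missing_features=True,
)
rel_expr = abs_expr.translate_data("RelativeAbundance")
# gene2 in c1 becomes 30.0 / (10.0 + 30.0) = 0.75
```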
+ + def average_expression_replicates(self, strain_list: list) -> 'MSExpression': + """Average expression replicates for each strain. + + Takes an MSExpression object with replicate columns (e.g., ACN2586_1, ACN2586_2, ...) + and averages them to create single columns per strain (e.g., ACN2586). + + Args: + strain_list: List of strain names (e.g., ["ACN2586", "ACN2821", ...]) + + Returns: + New MSExpression object with averaged data per strain + + Raises: + ValueError: If no data found for any strain in the list + """ + try: + # Access the underlying DataFrame + expression_df = self._data.copy() + + # Create new DataFrame for averaged data + averaged_data = {} + + # Keep the index (gene/protein IDs) + averaged_data['index'] = expression_df.index + + # For each strain, find and average its replicates + for strain in strain_list: + # Find columns that match this strain pattern (e.g., ACN2586_1, ACN2586_2, ...) + replicate_cols = [col for col in expression_df.columns if col.startswith(f"{strain}_")] + + if replicate_cols: + # Average the replicates + averaged_data[strain] = expression_df[replicate_cols].mean(axis=1) + logger.info(f"Averaged {len(replicate_cols)} replicates for strain {strain}") + else: + # No replicates found - check if strain column exists as-is + if strain in expression_df.columns: + averaged_data[strain] = expression_df[strain] + logger.info(f"No replicates found for {strain}, using existing column") + else: + logger.warning(f"No data found for strain {strain}") + + # Create new DataFrame from averaged data + averaged_df = pd.DataFrame(averaged_data) + averaged_df.set_index('index', inplace=True) + + # Create a deep copy of the expression object + averaged_expression = copy.deepcopy(self) + + # Replace the data with averaged data + averaged_expression._data = averaged_df + + # Update conditions list to match new columns + # Clear and rebuild conditions using proper MSCondition class + averaged_expression.conditions = DictList() + for strain in strain_list: + if strain in averaged_df.columns: + condition = MSCondition(strain, averaged_expression) + averaged_expression.conditions.append(condition) + + logger.info(f"Created averaged expression data with {len(averaged_expression.conditions)} conditions") + + return averaged_expression + + except Exception as e: + logger.error(f"Error averaging expression replicates: {str(e)}") + raise + + def fit_model_expression_to_data( + self, + model: 'MSModelUtil', + condition: str, + default_coef: float = 0.00001, + activation_threshold: float = None, # cshenry 10/16/2026: Changed default for activation to None so activation will be off by default + deactivation_threshold: float = 0.000001, + on_coef_override: float = None, + off_coef_override: float = None, + analyze_solution_essentiality: bool = False + ) -> Solution: + """Fit metabolic model fluxes to expression data using threshold-based constraints. + + This function integrates gene or reaction expression data with a metabolic model + to predict fluxes that are consistent with observed expression patterns. Reactions + with high expression are encouraged (activated), while reactions with low expression + are discouraged (deactivated). + + The function automatically handles: + - Conversion from gene-level to reaction-level expression (if needed) + - Transformation of expression data types to RelativeAbundance + - Construction of activation/deactivation dictionaries + - Integration with ExpressionActivationPkg for constrained optimization + + Parameters + ---------- + model : MSModelUtil + The metabolic model to fit to expression data. Must contain the same reactions + as the expression model (if expression is already at reaction level). + condition : str + The condition ID whose expression values will be used for fitting. Must exist + in the expression data conditions. + activation_threshold : float, optional + Minimum relative abundance value for a reaction to be considered "active". + Reactions with expression above this threshold will be encouraged in the + optimization. Default: None (activation constraints are disabled). + deactivation_threshold : float, optional + Maximum relative abundance value for a reaction to be considered "inactive". + Reactions with expression below this threshold will be discouraged in the + optimization. Default: 0.000001 (0.0001% relative abundance). + + Returns + ------- + cobra.Solution + The optimized FBA solution with expression-based flux constraints.
+    def fit_model_flux_to_data(
+        self,
+        model: 'MSModelUtil',
+        condition: str,
+        default_coef: float = 0.00001,
+        activation_threshold: float = None,  # cshenry 10/16/2026: Changed default for activation to None so activation will be off by default
+        deactivation_threshold: float = 0.000001,
+        on_coef_override: float = None,
+        off_coef_override: float = None,
+        analyze_solution_essentiality: bool = False
+    ) -> dict:
+        """Fit metabolic model fluxes to expression data using threshold-based constraints.
+
+        This function integrates gene or reaction expression data with a metabolic model
+        to predict fluxes that are consistent with observed expression patterns. Reactions
+        with high expression are encouraged (activated), while reactions with low expression
+        are discouraged (deactivated).
+
+        The function automatically handles:
+        - Conversion from gene-level to reaction-level expression (if needed)
+        - Transformation of expression data types to RelativeAbundance
+        - Construction of activation/deactivation dictionaries
+        - Integration with ExpressionActivationPkg for constrained optimization
+
+        Parameters
+        ----------
+        model : MSModelUtil
+            The metabolic model to fit to expression data. Must contain the same reactions
+            as the expression model (if expression is already at reaction level).
+        condition : str
+            The condition ID whose expression values will be used for fitting. Must exist
+            in the expression data conditions.
+        default_coef : float, optional
+            Coefficient applied to reactions without expression data or with expression
+            between the two thresholds. Default: 0.00001.
+        activation_threshold : float, optional
+            Minimum relative abundance value for a reaction to be considered "active".
+            Reactions with expression above this threshold will be encouraged in the
+            optimization. Default: None (activation constraints disabled).
+        deactivation_threshold : float, optional
+            Maximum relative abundance value for a reaction to be considered "inactive".
+            Reactions with expression below this threshold will be discouraged in the
+            optimization. Default: 0.000001 (0.0001% relative abundance).
+        on_coef_override : float, optional
+            If set, overrides the computed per-reaction activation coefficients.
+        off_coef_override : float, optional
+            If set, overrides the computed per-reaction deactivation coefficients.
+        analyze_solution_essentiality : bool, optional
+            If True, knock out each flux-carrying reaction in the fitted solution and
+            record its impact on the objective. Default: False.
+
+        Returns
+        -------
+        dict
+            Analysis results. The key "solution" holds the optimized cobra.Solution
+            (objective_value, fluxes, status); "on_on", "on_off", "off_on", "off_off",
+            "none_on", and "none_off" group reaction IDs by expression call versus flux
+            activity; essentiality fields (baseline_growth, counts, "reactions", and the
+            "*_reduced" lists) are populated when analyze_solution_essentiality=True.
+
+        Raises
+        ------
+        ValueError
+            If the expression model does not match the input model (when expression is
+            already at reaction level).
+        ValueError
+            If the specified condition does not exist in the expression data.
+        ValueError
+            If activation_threshold is less than or equal to deactivation_threshold.
+        ValueError
+            If activation is enabled and no reactions have expression values above the
+            activation threshold.
+        RuntimeError
+            If the optimization fails or produces an infeasible solution.
+
+        Notes
+        -----
+        - This function does NOT modify the original expression data or model
+        - All model modifications occur within a context manager and are reverted
+        - Reactions without expression data, or with expression between the specified
+          thresholds, are weighted using the default_coef argument
+        - The function uses ExpressionActivationPkg internally for constraint building
+
+        See Also
+        --------
+        MSExpression.build_reaction_expression : Convert gene to reaction expression
+        ExpressionActivationPkg.build_package : Build expression-based constraints
+        MSModelUtil.test_single_condition : Run FBA on a single condition
+
+        References
+        ----------
+        .. [1] Becker, S. A., & Palsson, B. O. (2008). Context-specific metabolic networks
+           are consistent with experiments. PLoS computational biology, 4(5), e1000082.
+        """
+        # cshenry 10/16/2026: Checking that model is MSModelUtil and converting if not
+        if not isinstance(model, MSModelUtil):
+            model = MSModelUtil(model)
+
+        # Task 1.6: Initial logging
+        logger.info(f"Fitting model flux to expression or fitness data for condition: {condition}")
+
+        # Task 1.4: Threshold validation
+        if activation_threshold is not None and activation_threshold <= deactivation_threshold:
+            raise ValueError(
+                f"activation_threshold ({activation_threshold}) must be greater than "
+                f"deactivation_threshold ({deactivation_threshold})"
+            )
+
+        # Task 1.5: Condition validation
+        if condition not in self.conditions:
+            available_conditions = [c.id for c in self.conditions]
+            raise ValueError(
+                f"Condition '{condition}' not found in expression data. "
+                f"Available conditions: {available_conditions}"
+            )
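+        # Editorial note - threshold semantics (worked example, RelativeAbundance):
+        #   activation_threshold = 0.002, deactivation_threshold = 1e-6
+        #   expr_value = 0.01  -> on_hash coefficient  (0.01 - 0.002) / 0.002 = 4.0
+        #   expr_value = 1e-7  -> off_hash coefficient (1e-6 - 1e-7) / 1e-6   = 0.9
+        # For NormalizedRatios the deviation |1 - expr_value| is compared to the
+        # thresholds instead.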
" + f"Missing in model: {missing_in_model}" + ) + + rxn_expression = self + + # Task 2.5-2.13: Expression type transformation + if rxn_expression.type != "RelativeAbundance" and rxn_expression.type != "NormalizedRatios": + raise ValueError( + f"Reaction expression must be in terms of relative abundance or normalized ratios" + ) + + # Initialize empty dictionaries + on_hash = {} + off_hash = {} + + # Iterate through reactions and build dictionaries + for rxn in model.model.reactions: + expr_value = rxn_expression.get_value(rxn.id, condition) + if expr_value is None: + continue + if rxn_expression.type == "NormalizedRatios": + if activation_threshold is not None and abs(1 - expr_value) >= activation_threshold: + on_hash[rxn.id] = 10 * (1 - expr_value) + elif abs(1-expr_value) <= deactivation_threshold: + off_hash[rxn.id] = 10*(1 - expr_value) + else: + if activation_threshold is not None and expr_value > activation_threshold: + if activation_threshold != 0: + on_hash[rxn.id] = (expr_value - activation_threshold)/activation_threshold + else: + on_hash[rxn.id] = expr_value+1 + elif expr_value < deactivation_threshold: + if deactivation_threshold != 0: + off_hash[rxn.id] = (deactivation_threshold - expr_value)/deactivation_threshold + else: + off_hash[rxn.id] = expr_value+1 + + print("On:", on_hash) + print("Off:", off_hash) + + # Log dictionary sizes + logger.info(f"Identified {len(on_hash)} reactions for activation (above threshold {activation_threshold})") + logger.info(f"Identified {len(off_hash)} reactions for deactivation (below threshold {deactivation_threshold})") + + # Access package manager + pkgmgr = model.pkgmgr + + # Get ExpressionActivationPkg + expr_pkg = pkgmgr.getpkg("ExpressionActivationPkg") + + # Use context manager for transient modifications + output = {"on_on":[],"on_off":[], "off_on":[], "off_off":[],"none_on":[],"none_off":[],"on_on_reduced":[],"off_on_reduced":[],"none_on_reduced":[],"solution":None} + original_objective = model.model.objective + with model.model: + # Task 4.4: Build package with dictionaries + expr_pkg.build_package(on_hash, off_hash, other_coef=default_coef, on_coeff=on_coef_override, off_coeff=off_coef_override) + # Task 4.5: Execute optimization + output["objective"] = model.model.objective + output["solution"] = model.model.optimize() + for rxn in model.model.reactions: + if rxn.id in on_hash: + if abs(output["solution"].fluxes[rxn.id]) > 1e-6: + output["on_on"].append(rxn.id) + else: + output["on_off"].append(rxn.id) + elif rxn.id in off_hash: + if abs(output["solution"].fluxes[rxn.id]) > 1e-6: + output["off_on"].append(rxn.id) + else: + output["off_off"].append(rxn.id) + else: + if abs(output["solution"].fluxes[rxn.id]) > 1e-6: + output["none_on"].append(rxn.id) + else: + output["none_off"].append(rxn.id) + + # Task 4.6: Validate solution status + if output["solution"].status != "optimal": + raise RuntimeError( + f"Optimization failed with status: {output['solution'].status}. " + f"The model may be infeasible with the given expression constraints." 
+        # Use context manager for transient modifications
+        output = {"on_on": [], "on_off": [], "off_on": [], "off_off": [], "none_on": [], "none_off": [], "on_on_reduced": [], "off_on_reduced": [], "none_on_reduced": [], "solution": None}
+        original_objective = model.model.objective
+        with model.model:
+            # Task 4.4: Build package with dictionaries
+            expr_pkg.build_package(on_hash, off_hash, other_coef=default_coef, on_coeff=on_coef_override, off_coeff=off_coef_override)
+            # Task 4.5: Execute optimization
+            output["objective"] = model.model.objective
+            output["solution"] = model.model.optimize()
+            for rxn in model.model.reactions:
+                if rxn.id in on_hash:
+                    if abs(output["solution"].fluxes[rxn.id]) > 1e-6:
+                        output["on_on"].append(rxn.id)
+                    else:
+                        output["on_off"].append(rxn.id)
+                elif rxn.id in off_hash:
+                    if abs(output["solution"].fluxes[rxn.id]) > 1e-6:
+                        output["off_on"].append(rxn.id)
+                    else:
+                        output["off_off"].append(rxn.id)
+                else:
+                    if abs(output["solution"].fluxes[rxn.id]) > 1e-6:
+                        output["none_on"].append(rxn.id)
+                    else:
+                        output["none_off"].append(rxn.id)
+
+        # Task 4.6: Validate solution status
+        if output["solution"].status != "optimal":
+            raise RuntimeError(
+                f"Optimization failed with status: {output['solution'].status}. "
+                f"The model may be infeasible with the given expression constraints."
+            )
+
+        # Task 4.7: Log optimization result
+        logger.info(f"Optimization completed with objective value: {output['solution'].objective_value}")
+
+        # Optionally assess how essential each active reaction is in the fitted state
+        if analyze_solution_essentiality:
+            # Categorize reactions by flux
+            zero_flux_rxns = []
+            active_rxns = []
+
+            model_rxn_ids = {r.id for r in model.model.reactions}
+            for rxn_id, flux in output["solution"].fluxes.items():
+                if rxn_id not in model_rxn_ids:
+                    continue
+                if abs(flux) <= 1e-9:
+                    zero_flux_rxns.append(rxn_id)
+                else:
+                    active_rxns.append((rxn_id, flux))
+
+            logger.info(f"  Zero-flux reactions: {len(zero_flux_rxns)}")
+            logger.info(f"  Active reactions: {len(active_rxns)}")
+
+            with model.model:
+                # Set zero-flux reactions to have zero bounds
+                for rxn_id in zero_flux_rxns:
+                    rxn = model.model.reactions.get_by_id(rxn_id)
+                    rxn.lower_bound = 0
+                    rxn.upper_bound = 0
+
+                # Get baseline growth with constrained model
+                baseline_growth = model.model.optimize().objective_value
+
+                # Test each active reaction knockout
+                essentiality_results = {}
+                essential_count = 0
+                reduced_count = 0
+
+                for rxn_id, original_flux in active_rxns:
+                    rxn = model.model.reactions.get_by_id(rxn_id)
+
+                    # Save original bounds
+                    orig_lb = rxn.lower_bound
+                    orig_ub = rxn.upper_bound
+
+                    # Knock out the reaction
+                    rxn.lower_bound = 0
+                    rxn.upper_bound = 0
+
+                    # Optimize
+                    ko_solution = model.model.optimize()
+
+                    if ko_solution.status == 'optimal':
+                        ko_growth = ko_solution.objective_value
+                        growth_ratio = ko_growth / baseline_growth if baseline_growth > 0 else 0
+                    else:
+                        ko_growth = 0
+                        growth_ratio = 0
+
+                    # Categorize impact
+                    if growth_ratio < 0.01:
+                        impact = "essential"
+                        essential_count += 1
+                    elif growth_ratio < 0.95:
+                        impact = "reduced"
+                        reduced_count += 1
+                    else:
+                        impact = "dispensable"
+
+                    essentiality_results[rxn_id] = {
+                        "expression_data_status": "none",
+                        "original_flux": original_flux,
+                        "ko_growth": ko_growth,
+                        "growth_ratio": growth_ratio,
+                        "impact": impact
+                    }
+                    if rxn_id in on_hash:
+                        essentiality_results[rxn_id]["expression_data_status"] = "on"
+                        if growth_ratio < 0.95:
+                            output["on_on_reduced"].append(rxn_id)
+                    elif rxn_id in off_hash:
+                        essentiality_results[rxn_id]["expression_data_status"] = "off"
+                        if growth_ratio < 0.95:
+                            output["off_on_reduced"].append(rxn_id)
+                    else:
+                        if growth_ratio < 0.95:
+                            output["none_on_reduced"].append(rxn_id)
+
+                    # Restore original bounds
+                    rxn.lower_bound = orig_lb
+                    rxn.upper_bound = orig_ub
+
+            logger.info(f"  Essential reactions: {essential_count}")
+            logger.info(f"  Reduced growth reactions: {reduced_count}")
+            logger.info(f"  Dispensable reactions: {len(active_rxns) - essential_count - reduced_count}")
+
+            output["baseline_growth"] = baseline_growth
+            output["zero_flux_count"] = len(zero_flux_rxns)
+            output["active_count"] = len(active_rxns)
+            output["essential_count"] = essential_count
+            output["reduced_count"] = reduced_count
+            output["reactions"] = essentiality_results
+
+        # Task 4.8: Return the analysis dict (solution plus categorizations)
+        return output
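+    # Illustrative usage of fit_model_flux_to_data (editorial sketch; the
+    # variable names below are hypothetical and not part of the API):
+    #
+    #     model_util = MSModelUtil.get(cobra_model)
+    #     result = gene_expression.fit_model_flux_to_data(
+    #         model_util, "condition1", analyze_solution_essentiality=True
+    #     )
+    #     result["solution"].objective_value   # fitted objective
+    #     result["on_off"]                     # expression-ON reactions carrying no flux
+    #     result["reactions"]                  # per-reaction knockout impact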
activation_threshold >= deactivation_threshold: + raise ValueError( + f"activation_threshold ({activation_threshold}) must be less than " + f"deactivation_threshold ({deactivation_threshold})" + ) + + if condition not in self.conditions: + available_conditions = [c.id for c in self.conditions] + raise ValueError( + f"Condition '{condition}' not found in expression data. " + f"Available conditions: {available_conditions}" + ) + + if self.type != "NormalizedRatios": + raise ValueError( + f"Expression must be in terms of normalized ratios" + ) + + # Initialize empty dictionaries + on_hash = {} + off_hash = {} + + # Iterate through reactions and build dictionaries + for rxn in model.model.reactions: + # Check if the reaction ID is in the expression data + if rxn.id in self.features: + expr_value = self.get_value(rxn.id, condition) + else: + lowest_value = None + for gene_id in rxn.genes: + if gene_id in self.features: + expr_value = self.get_value(gene_id, condition) + if lowest_value is None or expr_value < lowest_value: + lowest_value = expr_value + if lowest_value is None: + expr_value = None + else: + expr_value = lowest_value + if expr_value is None: + continue + if expr_value <= activation_threshold: + on_hash[rxn.id] = 1 + 10 * (activation_threshold - expr_value) + elif expr_value >= deactivation_threshold: + off_hash[rxn.id] = 1 + 10 * (expr_value-deactivation_threshold) + + print("On:", on_hash) + print("Off:", off_hash) + + # Log dictionary sizes + logger.info(f"Identified {len(on_hash)} reactions for activation (above threshold {activation_threshold})") + logger.info(f"Identified {len(off_hash)} reactions for deactivation (below threshold {deactivation_threshold})") + + # Get ExpressionActivationPkg + expr_pkg = model.pkgmgr.getpkg("ExpressionActivationPkg") + + # Use context manager for transient modifications + output = {"on_on":[],"on_off":[], "off_on":[], "off_off":[],"none_on":[],"none_off":[],"on_on_reduced":[],"off_on_reduced":[],"none_on_reduced":[],"solution":None} + original_objective = model.model.objective + with model.model: + expr_pkg.build_package(on_hash, off_hash, other_coef=default_coef, on_coeff=on_coef_override, off_coeff=off_coef_override) + output["solution"] = model.model.optimize() + for rxn in model.model.reactions: + if rxn.id in on_hash: + if abs(output["solution"].fluxes[rxn.id]) > 1e-6: + output["on_on"].append(rxn.id) + else: + output["on_off"].append(rxn.id) + elif rxn.id in off_hash: + if abs(output["solution"].fluxes[rxn.id]) > 1e-6: + output["off_on"].append(rxn.id) + else: + output["off_off"].append(rxn.id) + else: + if abs(output["solution"].fluxes[rxn.id]) > 1e-6: + output["none_on"].append(rxn.id) + else: + output["none_off"].append(rxn.id) + + if output["solution"].status != "optimal": + raise RuntimeError( + f"Optimization failed with status: {output['solution'].status}. " + f"The model may be infeasible with the given expression constraints." 
+        # Initialize empty dictionaries
+        on_hash = {}
+        off_hash = {}
+
+        # Iterate through reactions and build dictionaries
+        for rxn in model.model.reactions:
+            # Check if the reaction ID is in the expression data
+            if rxn.id in self.features:
+                expr_value = self.get_value(rxn.id, condition)
+            else:
+                # Fall back to the worst (lowest) fitness ratio among the reaction's genes
+                lowest_value = None
+                for gene in rxn.genes:
+                    if gene.id in self.features:
+                        gene_value = self.get_value(gene.id, condition)
+                        if gene_value is not None and (lowest_value is None or gene_value < lowest_value):
+                            lowest_value = gene_value
+                expr_value = lowest_value
+            if expr_value is None:
+                continue
+            if expr_value <= activation_threshold:
+                on_hash[rxn.id] = 1 + 10 * (activation_threshold - expr_value)
+            elif expr_value >= deactivation_threshold:
+                off_hash[rxn.id] = 1 + 10 * (expr_value - deactivation_threshold)
+
+        logger.debug(f"On: {on_hash}")
+        logger.debug(f"Off: {off_hash}")
+
+        # Log dictionary sizes
+        logger.info(f"Identified {len(on_hash)} reactions for activation (fitness ratio at or below {activation_threshold})")
+        logger.info(f"Identified {len(off_hash)} reactions for deactivation (fitness ratio at or above {deactivation_threshold})")
+
+        # Get ExpressionActivationPkg
+        expr_pkg = model.pkgmgr.getpkg("ExpressionActivationPkg")
+
+        # Use context manager for transient modifications
+        output = {"on_on": [], "on_off": [], "off_on": [], "off_off": [], "none_on": [], "none_off": [], "on_on_reduced": [], "off_on_reduced": [], "none_on_reduced": [], "solution": None}
+        original_objective = model.model.objective
+        with model.model:
+            expr_pkg.build_package(on_hash, off_hash, other_coef=default_coef, on_coeff=on_coef_override, off_coeff=off_coef_override)
+            output["solution"] = model.model.optimize()
+            for rxn in model.model.reactions:
+                if rxn.id in on_hash:
+                    if abs(output["solution"].fluxes[rxn.id]) > 1e-6:
+                        output["on_on"].append(rxn.id)
+                    else:
+                        output["on_off"].append(rxn.id)
+                elif rxn.id in off_hash:
+                    if abs(output["solution"].fluxes[rxn.id]) > 1e-6:
+                        output["off_on"].append(rxn.id)
+                    else:
+                        output["off_off"].append(rxn.id)
+                else:
+                    if abs(output["solution"].fluxes[rxn.id]) > 1e-6:
+                        output["none_on"].append(rxn.id)
+                    else:
+                        output["none_off"].append(rxn.id)
+
+        if output["solution"].status != "optimal":
+            raise RuntimeError(
+                f"Optimization failed with status: {output['solution'].status}. "
+                f"The model may be infeasible with the given expression constraints."
+            )
+
+        logger.info(f"Optimization completed with objective value: {output['solution'].objective_value}")
+
+        # Categorize reactions by flux
+        zero_flux_rxns = []
+        active_rxns = []
+
+        model_rxn_ids = {r.id for r in model.model.reactions}
+        for rxn_id, flux in output["solution"].fluxes.items():
+            if rxn_id not in model_rxn_ids:
+                continue
+            if abs(flux) <= 1e-9:
+                zero_flux_rxns.append(rxn_id)
+            else:
+                active_rxns.append((rxn_id, flux))
+
+        logger.info(f"  Zero-flux reactions: {len(zero_flux_rxns)}")
+        logger.info(f"  Active reactions: {len(active_rxns)}")
+
+        with model.model:
+            # Set zero-flux reactions to have zero bounds
+            for rxn_id in zero_flux_rxns:
+                rxn = model.model.reactions.get_by_id(rxn_id)
+                rxn.lower_bound = 0
+                rxn.upper_bound = 0
+
+            # Get baseline growth with constrained model
+            baseline_growth = model.model.optimize().objective_value
+
+            # Test each active reaction knockout
+            essentiality_results = {}
+            essential_count = 0
+            reduced_count = 0
+
+            for rxn_id, original_flux in active_rxns:
+                rxn = model.model.reactions.get_by_id(rxn_id)
+
+                # Save original bounds
+                orig_lb = rxn.lower_bound
+                orig_ub = rxn.upper_bound
+
+                # Knock out the reaction
+                rxn.lower_bound = 0
+                rxn.upper_bound = 0
+
+                # Optimize
+                ko_solution = model.model.optimize()
+
+                if ko_solution.status == 'optimal':
+                    ko_growth = ko_solution.objective_value
+                    growth_ratio = ko_growth / baseline_growth if baseline_growth > 0 else 0
+                else:
+                    ko_growth = 0
+                    growth_ratio = 0
+
+                # Categorize impact
+                if growth_ratio < 0.01:
+                    impact = "essential"
+                    essential_count += 1
+                elif growth_ratio < 0.95:
+                    impact = "reduced"
+                    reduced_count += 1
+                else:
+                    impact = "dispensable"
+
+                essentiality_results[rxn_id] = {
+                    "expression_data_status": "none",
+                    "original_flux": original_flux,
+                    "ko_growth": ko_growth,
+                    "growth_ratio": growth_ratio,
+                    "impact": impact
+                }
+                if rxn_id in on_hash:
+                    essentiality_results[rxn_id]["expression_data_status"] = "on"
+                    if growth_ratio < 0.95:
+                        output["on_on_reduced"].append(rxn_id)
+                elif rxn_id in off_hash:
+                    essentiality_results[rxn_id]["expression_data_status"] = "off"
+                    if growth_ratio < 0.95:
+                        output["off_on_reduced"].append(rxn_id)
+                else:
+                    if growth_ratio < 0.95:
+                        output["none_on_reduced"].append(rxn_id)
+
+                # Restore original bounds
+                rxn.lower_bound = orig_lb
+                rxn.upper_bound = orig_ub
+
+        logger.info(f"  Essential reactions: {essential_count}")
+        logger.info(f"  Reduced growth reactions: {reduced_count}")
+        logger.info(f"  Dispensable reactions: {len(active_rxns) - essential_count - reduced_count}")
+
+        output["baseline_growth"] = baseline_growth
+        output["zero_flux_count"] = len(zero_flux_rxns)
+        output["active_count"] = len(active_rxns)
+        output["essential_count"] = essential_count
+        output["reduced_count"] = reduced_count
+        output["reactions"] = essentiality_results
+
+        # Return the analysis dict (solution plus categorizations)
+        return output
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..8e0e52df
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,27 @@
+[build-system]
+requires = [
+    'setuptools>=40.6.0',
+    'wheel'
+]
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 88
+target-version = ['py38']
+include = '\.pyi?$'
+exclude = '''
+(
+  /(
+      \.eggs         # exclude a few common directories in the
+    | \.git          # root of the project
+    | \.hg
+    | \.mypy_cache
+    | \.tox
+    | \.venv
+    | _build
+    | buck-out
+    | build
+    | dist
+  )/
+)
+'''
diff --git a/setup.py b/setup.py
index 775d60a4..f3c735ba 100644
--- 
a/setup.py +++ b/setup.py @@ -10,8 +10,9 @@ setup( name="ModelSEEDpy", - version="0.2.2", + version="0.4.2", description="Python package for building and analyzing models using ModelSEED", + long_description_content_type="text/x-rst", long_description=readme, author="Christopher Henry", author_email="chenry@anl.gov", @@ -19,23 +20,35 @@ license=license, packages=find_packages(exclude=("docs")), package_data={ - "modelseedpy": ["config.cfg"], + "modelseedpy": ["config.cfg", "data/*"], }, + classifiers=[ + "Development Status :: 3 - Alpha", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Intended Audience :: Science/Research", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Natural Language :: English", + ], install_requires=[ "networkx >= 2.4", - "cobra >= 0.17.1", - "scikit-learn == 0.23.2", # too support KBase pickle models + "cobra >= 0.29.0", + "pandas >= 1.5.0", + "scikit-learn == 1.2.0", # version lock for pickle ML models "scipy >= 1.5.4", "chemicals >= 1.0.13", "chemw >= 0.3.2", "matplotlib >= 3.0.0", - "pyeda", + "Jinja2 >= 3.1.4", + "sympy >=1.12.0", ], tests_require=[ "pytest", ], project_urls={ - "Documentation": "https://modelseedpy.readthedocs.io/en/stable/", + "Documentation": "https://modelseedpy.readthedocs.io/en/latest/", "Issues": "https://github.com/ModelSEED/ModelSEEDpy/issues", }, ) diff --git a/tests/test_advanced.py b/tests/core/test_advanced.py similarity index 100% rename from tests/test_advanced.py rename to tests/core/test_advanced.py diff --git a/tests/test_basic.py b/tests/core/test_basic.py similarity index 100% rename from tests/test_basic.py rename to tests/core/test_basic.py diff --git a/tests/core/test_msatpcorreption.py b/tests/core/test_msatpcorreption.py index 13acf3c3..a60d33ec 100644 --- a/tests/core/test_msatpcorreption.py +++ b/tests/core/test_msatpcorreption.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import os import pytest import json import cobra @@ -8,13 +9,26 @@ @pytest.fixture def template(): - with open("./tests/test_data/template_core_bigg.json", "r") as fh: + with open( + os.path.join( + os.path.dirname(__file__), "..", "test_data", "template_core_bigg.json" + ), + "r", + ) as fh: return MSTemplateBuilder.from_dict(json.load(fh)).build() @pytest.fixture def template_genome_scale(): - with open("./tests/test_data/template_genome_scale_bigg.json", "r") as fh: + with open( + os.path.join( + os.path.dirname(__file__), + "..", + "test_data", + "template_genome_scale_bigg.json", + ), + "r", + ) as fh: return MSTemplateBuilder.from_dict(json.load(fh)).build() @@ -23,7 +37,12 @@ def get_model(): def _method(ko=None, added_compounds=None, added_reactions=None): if ko is None: ko = [] - with open("./tests/test_data/e_coli_core.json", "r") as fh: + with open( + os.path.join( + os.path.dirname(__file__), "..", "test_data", "e_coli_core.json" + ), + "r", + ) as fh: model_json = json.load(fh) model_json["compartments"] = { k + "0": v for (k, v) in model_json["compartments"].items() @@ -89,7 +108,7 @@ def media_acetate_aerobic(): "h2o": (-1000, 1000), } ) - media.id = "glc/o2" + media.id = "ac/o2" return media @@ -186,9 +205,14 @@ def test_infinite_atp_model_growth_boost( def test_ms_atp_correction1(get_model, template, media_all_aerobic): + atp_hydrolysis_id = "ATPM_c0" model = get_model(["GLCpts_c0", "NADH16_c0", "CYTBD_c0", "O2t_c0"]) atp_correction = MSATPCorrection( - model, template, media_all_aerobic, 
atp_hydrolysis_id="ATPM_c0" + model, + template, + media_all_aerobic, + atp_hydrolysis_id=atp_hydrolysis_id, + load_default_medias=False, ) atp_correction.evaluate_growth_media() assert len(atp_correction.noncore_reactions) == 1 # the biomass @@ -211,9 +235,14 @@ def test_ms_atp_correction1(get_model, template, media_all_aerobic): tests = atp_correction.build_tests() assert tests - assert len(tests) == 1 - assert tests[0]["threshold"] > 0 - assert tests[0]["objective"] == "ATPM_c0" + assert len(tests) == 2 # glucose and empty + for t in tests: + if t["media"].id == "empty": + assert t["threshold"] <= 1e-05 + else: + assert t["threshold"] > 1e-05 + assert t["objective"] == atp_hydrolysis_id + assert t["is_max_threshold"] is True def test_ms_atp_correction_and_gap_fill1( @@ -225,35 +254,45 @@ def test_ms_atp_correction_and_gap_fill1( ): from modelseedpy import MSGapfill + atp_hydrolysis_id = "ATPM_c0" + model = get_model_with_infinite_atp_loop(["GLCpts_c0", "GLUSy_c0", "GLUDy_c0"]) model.reactions.ATPM_c0.lower_bound = 0 model.reactions.ATPM_c0.upper_bound = 1000 - + model.objective = atp_hydrolysis_id atp_correction = MSATPCorrection( - model, template, [media_glucose_aerobic], atp_hydrolysis_id="ATPM_c0" + model, + template, + [media_glucose_aerobic], + atp_hydrolysis_id=atp_hydrolysis_id, + load_default_medias=False, ) tests = atp_correction.run_atp_correction() - # expected tests = [{'media': MSMedia object, 'is_max_threshold': True, 'threshold': 21.0, 'objective': 'ATPM_c0'}] assert tests - assert len(tests) == 1 - assert tests[0]["threshold"] > 0 - assert tests[0]["objective"] == "ATPM_c0" - + assert len(tests) == 2 + for t in tests: + if t["media"].id == "empty": + assert t["threshold"] <= 1e-05 + else: + assert t["threshold"] > 1e-05 + assert t["objective"] == atp_hydrolysis_id + assert t["is_max_threshold"] is True + + model.objective = "BIOMASS_Ecoli_core_w_GAM_c0" gap_fill = MSGapfill(model, [template_genome_scale], [], tests, {}, []) result = gap_fill.run_gapfilling( media_genome_scale_glucose_aerobic, "BIOMASS_Ecoli_core_w_GAM_c0", minimum_obj=0.1, ) - # either GLUSy_c0 or GLUDy_c0 should be gap filled for glutamate assert result assert len(result["new"]) == 1 assert "GLUSy_c0" in result["new"] or "GLUDy_c0" in result["new"] - model = gap_fill.integrate_gapfill_solution(result) + gap_fill.integrate_gapfill_solution(result) - assert model + # TODO: add some model testing assertion diff --git a/tests/core/test_msgapfill.py b/tests/core/test_msgapfill.py index 77238f59..622a0924 100644 --- a/tests/core/test_msgapfill.py +++ b/tests/core/test_msgapfill.py @@ -1,54 +1,5 @@ # -*- coding: utf-8 -*- -""" -from glob import glob -os.environ["HOME"] = 'C:\\Users\\Andrew Freiburger\\Dropbox\\My PC (DESKTOP-M302P50)\\Documents\\UVic Civil Engineering\\Internships\\Agronne\\cobrakbase' -import cobrakbase -token = 'xx' -kbase = cobrakbase.KBaseAPI(token) -import re - -# define the example individual model and associated API media package -model = kbase.get_from_ws('e_coli_core.kb', 95098) -model.solver = 'optlang-cplex' - -# import the modelseedpy packages -import modelseedpy -from modelseedpy.core.msgapfill import MSGapfill -gapfill = MSGapfill(model) - -def test_init(): - assert type(gapfill.model) is cobrakbase.core.kbasefba.fbamodel.FBAModel - assert type(gapfill.blacklist) is list - assert type(gapfill.solutions) is dict - -def test_run_gapfilling_and_integrate_gapfill_solution(): - solutions = gapfill.run_gapfilling() - - # test that the objective expression is correctly set - if 
solutions is not None: - assert type(solutions) is dict - - # verify the integrate_gapfill_solution function - model_2 = gapfill.integrate_gapfill_solution(solutions) - assert type(model_2) is cobrakbase.core.kbasefba.fbamodel.FBAModel - - for reaction in solutions['reversed']: - if solution["reversed"][reaction] == ">": - assert reaction.upper_bound == 100 - else: - assert reaction.lower_bound == -100 - - for reaction in solutions['new']: - if solution["new"][reaction] == ">": - assert reaction.upper_bound == 100 - assert reaction.lower_bound == 0 - else: - assert reaction.upper_bound == 0 - assert reaction.lower_bound == -100 - -def test_gapfill(): - pass -""" +import os import pytest import json import cobra @@ -58,7 +9,12 @@ def test_gapfill(): @pytest.fixture def template(): - with open("./tests/test_data/template_core_bigg.json", "r") as fh: + with open( + os.path.join( + os.path.dirname(__file__), "..", "test_data", "template_core_bigg.json" + ), + "r", + ) as fh: return MSTemplateBuilder.from_dict(json.load(fh)).build() @@ -67,7 +23,12 @@ def get_model(): def _method(ko=None): if ko is None: ko = [] - with open("./tests/test_data/e_coli_core.json", "r") as fh: + with open( + os.path.join( + os.path.dirname(__file__), "..", "test_data", "e_coli_core.json" + ), + "r", + ) as fh: model_json = json.load(fh) model_json["compartments"] = { k + "0": v for (k, v) in model_json["compartments"].items() diff --git a/tests/core/test_msmodel.py b/tests/core/test_msmodel.py new file mode 100644 index 00000000..ec4027f5 --- /dev/null +++ b/tests/core/test_msmodel.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +from modelseedpy.core.msmodel import * + + +def test_get_direction_from_constraints1(): + res = get_direction_from_constraints(0, 1000) + + assert res == ">" + + +def test_get_direction_from_constraints2(): + res = get_direction_from_constraints(-1000, 0) + + assert res == "<" + + +def test_get_direction_from_constraints3(): + res = get_direction_from_constraints(-1000, 1000) + + assert res == "=" + + +def test_get_set_set1(): + res = get_set_set("A") + + assert len(res) == 1 + assert {"A"} in res + + +def test_get_set_set2(): + res = get_set_set("A and B") + + assert len(res) == 1 + assert {"A", "B"} in res + + +def test_get_set_set3(): + res = get_set_set("A or B") + + assert len(res) == 2 + assert {"A"} in res + assert {"B"} in res + + +def test_get_set_set4(): + res = get_set_set("A or B or C") + + assert len(res) == 3 + assert {"A"} in res + assert {"B"} in res + assert {"C"} in res + + +def test_get_set_set5(): + res = get_set_set("A or B and C") + + assert len(res) == 2 + assert {"A"} in res + assert {"B", "C"} in res + + +def test_get_set_set6(): + res = get_set_set("A and B or C") + + assert len(res) == 2 + assert {"A", "B"} in res + assert {"C"} in res + + +def test_get_set_set7(): + res = get_set_set("(A or B) and C") + + assert len(res) == 2 + assert {"A", "C"} in res + assert {"B", "C"} in res + + +def test_get_set_set8(): + res = get_set_set("A and (B or C)") + + assert len(res) == 2 + assert {"A", "B"} in res + assert {"A", "C"} in res diff --git a/tests/core/test_mstemplate.py b/tests/core/test_mstemplate.py new file mode 100644 index 00000000..9663e8c8 --- /dev/null +++ b/tests/core/test_mstemplate.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- +import pytest + +from modelseedpy.core.mstemplate import ( + MSTemplate, + MSTemplateMetabolite, + MSTemplateReaction, + MSTemplateSpecies, +) +from modelseedpy.core.mstemplate import ( + NewModelTemplateRole, + 
NewModelTemplateComplex, + MSTemplateCompartment, +) + + +@pytest.fixture +def empty_template(): + return MSTemplate("test", "test name", "test") + + +def test_empty_template(): + template = MSTemplate("test", "test name", "test") + assert template.id == "test" + assert template.name == "test name" + assert len(template.roles) == 0 + assert len(template.complexes) == 0 + assert len(template.compounds) == 0 + assert len(template.compcompounds) == 0 + assert len(template.reactions) == 0 + + +def test_template_add_role(empty_template): + role = NewModelTemplateRole("role1", "metabolic function") + empty_template.add_roles([role]) + assert len(empty_template.roles) == 1 + + +def test_template_add_role_mult(empty_template): + role_a = NewModelTemplateRole("roleA", "metabolic function A") + role_b = NewModelTemplateRole("roleB", "metabolic function B") + role_c = NewModelTemplateRole("roleC", "metabolic function C") + empty_template.add_roles([role_a, role_b, role_c]) + assert len(empty_template.roles) == 3 + + +def test_template_add_simple_complex(empty_template): + role = NewModelTemplateRole("role1", "metabolic function") + empty_template.add_roles([role]) + + seed_complex = NewModelTemplateComplex("complex1", "example complex") + + seed_complex.add_role(empty_template.roles.role1) + + empty_template.add_complexes([seed_complex]) + + assert len(empty_template.complexes) == 1 + + +def test_template_add_simple_metabolite(empty_template): + cpd_apple = MSTemplateMetabolite("apple", "C100", "just a apple") + empty_template.add_compounds([cpd_apple]) + + assert len(empty_template.compounds) == 1 + + +def test_template_add_simple_metabolite_species(empty_template): + cpd_apple = MSTemplateMetabolite("apple", "C100", "just a apple") + empty_template.add_compounds([cpd_apple]) + + comp_cpd_apple = MSTemplateSpecies("apple_k", 0, "k", "apple") + empty_template.add_comp_compounds([comp_cpd_apple]) + + assert len(empty_template.compounds) == 1 + assert len(empty_template.compcompounds) == 1 + assert empty_template.compcompounds.apple_k.compound + assert empty_template.compcompounds.apple_k.compound.name == "just a apple" + assert len(empty_template.compounds.apple.species) == 1 + + +def test_template_add_compartment(empty_template): + empty_template.compartments += [MSTemplateCompartment("w", "world", 4)] + + assert len(empty_template.compartments) == 1 + + +def test_template_add_reaction(empty_template): + cpd_apple = MSTemplateMetabolite("apple", "C100", "just a apple") + cpd_apple_pie = MSTemplateMetabolite("appie", "C1000", "apple pie (10 apples)") + empty_template.add_compounds([cpd_apple, cpd_apple_pie]) + + comp_cpd_apple = MSTemplateSpecies("apple_k", 0, "k", "apple") + comp_cpd_apple_pie = MSTemplateSpecies("appie_k", 0, "k", "appie") + empty_template.add_comp_compounds([comp_cpd_apple, comp_cpd_apple_pie]) + + rxn_make_pie = MSTemplateReaction( + "rxn_pie_k", "rxn00000", "make pie", "pie", 0, 1000 + ) + rxn_make_pie.add_metabolites( + { + empty_template.compcompounds.apple_k: -10, + empty_template.compcompounds.appie_k: 1, + } + ) + + empty_template.add_reactions([rxn_make_pie]) + + assert len(empty_template.reactions) == 1 + assert empty_template.reactions.rxn_pie_k.check_mass_balance() == {} diff --git a/tests/multiomics/__init__.py b/tests/multiomics/__init__.py new file mode 100644 index 00000000..766caf37 --- /dev/null +++ b/tests/multiomics/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +"""Tests for multiomics module.""" diff --git a/tests/multiomics/test_msexpression.py 
b/tests/multiomics/test_msexpression.py new file mode 100644 index 00000000..0dceabe5 --- /dev/null +++ b/tests/multiomics/test_msexpression.py @@ -0,0 +1,1110 @@ +# -*- coding: utf-8 -*- +""" +Comprehensive test suite for MSExpression refactoring. + +This test module covers the DataFrame-based refactoring of the MSExpression class, +ensuring all functionality works correctly with the new pandas-based data storage. +""" + +import unittest +import pandas as pd +import numpy as np +import tempfile +import os +from modelseedpy.multiomics.msexpression import ( + MSExpression, + MSExpressionFeature, + MSCondition, + compute_gene_score +) +from modelseedpy.core.msgenome import MSGenome, MSFeature +from cobra import Model, Reaction, Gene + + +class TestDataFrameInitialization(unittest.TestCase): + """TC-1: Test DataFrame initialization and structure.""" + + def test_msexpression_init_creates_empty_dataframe(self): + """Test that MSExpression.__init__() creates empty DataFrame with named index.""" + expr = MSExpression("RelativeAbundance") + self.assertIsInstance(expr._data, pd.DataFrame) + self.assertEqual(expr._data.index.name, 'feature_id') + self.assertEqual(len(expr._data), 0) + self.assertEqual(len(expr._data.columns), 0) + + def test_dataframe_index_name_is_feature_id(self): + """Test that DataFrame index is named 'feature_id'.""" + expr = MSExpression("FPKM") + self.assertEqual(expr._data.index.name, 'feature_id') + + def test_dataframe_is_private_attribute(self): + """Test that _data is a private attribute (single underscore).""" + expr = MSExpression("TPM") + self.assertTrue(hasattr(expr, '_data')) + self.assertIsInstance(expr._data, pd.DataFrame) + + +class TestDataLoadingFromDataFrame(unittest.TestCase): + """TC-2: Test data loading from DataFrame.""" + + def setUp(self): + """Create test data for loading tests.""" + self.test_df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene2', 'gene3'], + 'condition1': [10.5, 8.2, 15.3], + 'condition2': [20.3, 15.7, 12.1], + 'condition3': [5.1, 3.4, 7.8] + }) + + # Create a genome with features + self.genome = MSGenome() + self.genome.features.append(MSFeature('gene1', '')) + self.genome.features.append(MSFeature('gene2', '')) + self.genome.features.append(MSFeature('gene3', '')) + + def test_from_dataframe_basic_loading(self): + """Test basic loading from DataFrame.""" + expr = MSExpression.from_dataframe( + self.test_df, + genome=self.genome, + create_missing_features=False, + id_column='gene_id', + type='RelativeAbundance' + ) + + self.assertEqual(len(expr.features), 3) + self.assertEqual(len(expr.conditions), 3) + self.assertEqual(expr._data.shape, (3, 3)) + + def test_from_dataframe_bulk_loading(self): + """Test that data is loaded in bulk, not row-by-row.""" + expr = MSExpression.from_dataframe( + self.test_df, + genome=self.genome, + id_column='gene_id' + ) + + # Verify all values are loaded correctly + self.assertAlmostEqual(expr.get_value('gene1', 'condition1'), 10.5) + self.assertAlmostEqual(expr.get_value('gene2', 'condition2'), 15.7) + self.assertAlmostEqual(expr.get_value('gene3', 'condition3'), 7.8) + + def test_from_dataframe_with_ignore_columns(self): + """Test loading with ignore_columns parameter.""" + df = self.test_df.copy() + df['metadata'] = ['A', 'B', 'C'] + + expr = MSExpression.from_dataframe( + df, + genome=self.genome, + ignore_columns=['metadata'], + id_column='gene_id' + ) + + self.assertEqual(len(expr.conditions), 3) + self.assertNotIn('metadata', [c.id for c in expr.conditions]) + + def 
test_from_dataframe_with_description_column(self): + """Test loading with description_column parameter.""" + df = self.test_df.copy() + df['description'] = ['Desc1', 'Desc2', 'Desc3'] + + expr = MSExpression.from_dataframe( + df, + genome=self.genome, + description_column='description', + id_column='gene_id' + ) + + self.assertEqual(len(expr.conditions), 3) + self.assertNotIn('description', [c.id for c in expr.conditions]) + + def test_from_dataframe_default_id_column(self): + """Test that first column is used as ID if id_column not specified.""" + expr = MSExpression.from_dataframe( + self.test_df, + genome=self.genome + ) + + self.assertEqual(len(expr.features), 3) + self.assertIn('gene1', expr.features) + + def test_from_dataframe_creates_missing_features(self): + """Test create_missing_features parameter.""" + df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene2', 'gene_new'], + 'condition1': [10.5, 8.2, 5.0] + }) + + expr = MSExpression.from_dataframe( + df, + genome=self.genome, + create_missing_features=True, + id_column='gene_id' + ) + + # gene_new should be created + self.assertEqual(len(self.genome.features), 4) + + def test_from_dataframe_handles_nan_values(self): + """Test that NaN values are handled correctly.""" + df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene2'], + 'condition1': [10.5, np.nan], + 'condition2': [np.nan, 15.7] + }) + + expr = MSExpression.from_dataframe( + df, + genome=self.genome, + id_column='gene_id' + ) + + # NaN should be converted to None in API + self.assertIsNone(expr.get_value('gene2', 'condition1')) + self.assertIsNone(expr.get_value('gene1', 'condition2')) + + +class TestFeatureAndConditionManagement(unittest.TestCase): + """TC-3: Test feature and condition management.""" + + def setUp(self): + """Create test expression object.""" + self.genome = MSGenome() + self.genome.features.append(MSFeature('gene1', '')) + self.genome.features.append(MSFeature('gene2', '')) + + self.expr = MSExpression('RelativeAbundance') + self.expr.object = self.genome + + def test_add_feature_creates_msexpressionfeature(self): + """Test that add_feature creates MSExpressionFeature without values dict.""" + feature = self.expr.add_feature('gene1') + + self.assertIsInstance(feature, MSExpressionFeature) + self.assertEqual(feature.id, 'gene1') + self.assertFalse(hasattr(feature, 'values')) + + def test_add_value_updates_dataframe(self): + """Test that MSExpressionFeature.add_value updates parent DataFrame.""" + condition = MSCondition('cond1', self.expr) + self.expr.conditions.append(condition) + + feature = self.expr.add_feature('gene1') + feature.add_value(condition, 10.5) + + self.assertEqual(self.expr._data.loc['gene1', 'cond1'], 10.5) + + def test_add_value_creates_row_if_missing(self): + """Test that add_value creates feature row if missing.""" + condition = MSCondition('cond1', self.expr) + self.expr.conditions.append(condition) + + feature = self.expr.add_feature('gene1') + feature.add_value(condition, 10.5) + + self.assertIn('gene1', self.expr._data.index) + + def test_add_value_creates_column_if_missing(self): + """Test that add_value creates condition column if missing.""" + feature = self.expr.add_feature('gene1') + condition = MSCondition('cond1', self.expr) + + feature.add_value(condition, 10.5) + + self.assertIn('cond1', self.expr._data.columns) + + def test_get_value_from_dataframe(self): + """Test that get_value retrieves from DataFrame.""" + condition = MSCondition('cond1', self.expr) + self.expr.conditions.append(condition) + + feature = 
self.expr.add_feature('gene1') + feature.add_value(condition, 10.5) + + value = feature.get_value(condition) + self.assertAlmostEqual(value, 10.5) + + def test_get_value_converts_nan_to_none(self): + """Test that get_value converts NaN to None.""" + condition = MSCondition('cond1', self.expr) + self.expr.conditions.append(condition) + + feature = self.expr.add_feature('gene1') + self.expr._data.loc['gene1', 'cond1'] = np.nan + + value = feature.get_value(condition) + self.assertIsNone(value) + + def test_get_value_with_string_condition(self): + """Test get_value with condition ID string.""" + condition = MSCondition('cond1', self.expr) + self.expr.conditions.append(condition) + + feature = self.expr.add_feature('gene1') + feature.add_value(condition, 10.5) + + value = feature.get_value('cond1') + self.assertAlmostEqual(value, 10.5) + + def test_msexpression_get_value(self): + """Test MSExpression.get_value method.""" + condition = MSCondition('cond1', self.expr) + self.expr.conditions.append(condition) + + feature = self.expr.add_feature('gene1') + feature.add_value(condition, 10.5) + + value = self.expr.get_value('gene1', 'cond1') + self.assertAlmostEqual(value, 10.5) + + +class TestStatisticalMethods(unittest.TestCase): + """TC-4: Test statistical methods using pandas operations.""" + + def setUp(self): + """Create test expression with data.""" + df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene2', 'gene3', 'gene4'], + 'condition1': [10.0, 20.0, 30.0, 40.0], + 'condition2': [5.0, 10.0, 15.0, 20.0] + }) + + genome = MSGenome() + for i in range(1, 5): + genome.features.append(MSFeature(f'gene{i}', '')) + + self.expr = MSExpression.from_dataframe( + df, + genome=genome, + id_column='gene_id' + ) + + def test_condition_average_value(self): + """Test MSCondition.average_value using pandas.""" + cond1 = self.expr.conditions.get_by_id('condition1') + avg = cond1.average_value() + self.assertAlmostEqual(avg, 25.0) + + def test_condition_lowest_value(self): + """Test MSCondition.lowest_value using pandas.""" + cond1 = self.expr.conditions.get_by_id('condition1') + lowest = cond1.lowest_value() + self.assertAlmostEqual(lowest, 10.0) + + def test_condition_highest_value(self): + """Test MSCondition.highest_value using pandas.""" + cond1 = self.expr.conditions.get_by_id('condition1') + highest = cond1.highest_value() + self.assertAlmostEqual(highest, 40.0) + + def test_condition_sum_value(self): + """Test MSCondition.sum_value using pandas.""" + cond1 = self.expr.conditions.get_by_id('condition1') + total = cond1.sum_value() + self.assertAlmostEqual(total, 100.0) + + def test_condition_value_at_zscore(self): + """Test MSCondition.value_at_zscore calculation.""" + cond1 = self.expr.conditions.get_by_id('condition1') + # Mean = 25, StdDev = 11.18 (approximately) + value_at_z1 = cond1.value_at_zscore(1.0) + self.assertGreater(value_at_z1, 25.0) + self.assertLess(value_at_z1, 40.0) + + def test_statistical_methods_with_nan_values(self): + """Test that statistical methods handle NaN correctly.""" + self.expr._data.loc['gene1', 'condition1'] = np.nan + + cond1 = self.expr.conditions.get_by_id('condition1') + avg = cond1.average_value() + + # Should compute average of remaining 3 values: (20 + 30 + 40) / 3 = 30 + self.assertAlmostEqual(avg, 30.0) + + def test_statistical_methods_empty_condition(self): + """Test statistical methods with empty condition.""" + # Add empty condition + self.expr._data['empty_cond'] = np.nan + self.expr.conditions.append(MSCondition('empty_cond', self.expr)) + + cond = 
self.expr.conditions.get_by_id('empty_cond') + self.assertIsNone(cond.average_value()) + self.assertIsNone(cond.lowest_value()) + self.assertIsNone(cond.highest_value()) + self.assertEqual(cond.sum_value(), 0.0) + + +class TestValueRetrievalAndManipulation(unittest.TestCase): + """TC-5: Test value retrieval and manipulation.""" + + def setUp(self): + """Create test expression.""" + df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene2'], + 'condition1': [10.0, 20.0], + 'condition2': [5.0, 10.0] + }) + + genome = MSGenome() + genome.features.append(MSFeature('gene1', '')) + genome.features.append(MSFeature('gene2', '')) + + self.expr = MSExpression.from_dataframe( + df, + genome=genome, + id_column='gene_id' + ) + + def test_get_value_with_feature_string(self): + """Test get_value with feature ID string.""" + value = self.expr.get_value('gene1', 'condition1') + self.assertAlmostEqual(value, 10.0) + + def test_get_value_with_condition_string(self): + """Test get_value with condition ID string.""" + feature = self.expr.features.get_by_id('gene1') + value = feature.get_value('condition1') + self.assertAlmostEqual(value, 10.0) + + def test_get_value_missing_feature(self): + """Test get_value with missing feature.""" + value = self.expr.get_value('nonexistent', 'condition1') + self.assertIsNone(value) + + def test_get_value_missing_condition(self): + """Test get_value with missing condition.""" + feature = self.expr.features.get_by_id('gene1') + value = feature.get_value('nonexistent') + self.assertIsNone(value) + + def test_add_value_multiple_times(self): + """Test that add_value can update existing values.""" + feature = self.expr.features.get_by_id('gene1') + condition = self.expr.conditions.get_by_id('condition1') + + feature.add_value(condition, 100.0) + value = feature.get_value(condition) + self.assertAlmostEqual(value, 100.0) + + +class TestGPRIntegration(unittest.TestCase): + """TC-6: Test GPR integration and reaction expression building.""" + + def setUp(self): + """Create test model and expression.""" + # Create a simple model + self.model = Model('test_model') + + # Add genes + gene1 = Gene('gene1') + gene2 = Gene('gene2') + gene3 = Gene('gene3') + self.model.genes.extend([gene1, gene2, gene3]) + + # Add reactions with GPR rules + rxn1 = Reaction('rxn1') + rxn1.gene_reaction_rule = 'gene1' + + rxn2 = Reaction('rxn2') + rxn2.gene_reaction_rule = 'gene1 and gene2' + + rxn3 = Reaction('rxn3') + rxn3.gene_reaction_rule = 'gene1 or gene2' + + self.model.add_reactions([rxn1, rxn2, rxn3]) + + # Create genome + self.genome = MSGenome() + self.genome.features.append(MSFeature('gene1', '')) + self.genome.features.append(MSFeature('gene2', '')) + self.genome.features.append(MSFeature('gene3', '')) + + # Create gene expression + df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene2', 'gene3'], + 'condition1': [10.0, 20.0, 30.0] + }) + + self.gene_expr = MSExpression.from_dataframe( + df, + genome=self.genome, + id_column='gene_id' + ) + + def test_build_reaction_expression_basic(self): + """Test building reaction expression from gene expression.""" + rxn_expr = self.gene_expr.build_reaction_expression(self.model, default=0.0) + + self.assertEqual(len(rxn_expr.features), 3) + self.assertEqual(len(rxn_expr.conditions), 1) + + def test_build_reaction_expression_or_rule(self): + """Test OR rule in GPR (sum of gene values).""" + rxn_expr = self.gene_expr.build_reaction_expression(self.model, default=0.0) + + # rxn3 has 'gene1 or gene2', should be 10 + 20 = 30 + value = rxn_expr.get_value('rxn3', 
'condition1') + self.assertAlmostEqual(value, 30.0) + + def test_build_reaction_expression_and_rule(self): + """Test AND rule in GPR (min of gene values).""" + rxn_expr = self.gene_expr.build_reaction_expression(self.model, default=0.0) + + # rxn2 has 'gene1 and gene2', should be min(10, 20) = 10 + value = rxn_expr.get_value('rxn2', 'condition1') + self.assertAlmostEqual(value, 10.0) + + def test_build_reaction_expression_single_gene(self): + """Test single gene GPR rule.""" + rxn_expr = self.gene_expr.build_reaction_expression(self.model, default=0.0) + + # rxn1 has 'gene1', should be 10 + value = rxn_expr.get_value('rxn1', 'condition1') + self.assertAlmostEqual(value, 10.0) + + def test_build_reaction_expression_uses_dataframe(self): + """Test that build_reaction_expression accesses _data DataFrame.""" + rxn_expr = self.gene_expr.build_reaction_expression(self.model, default=0.0) + + # Verify the reaction expression has a DataFrame + self.assertIsInstance(rxn_expr._data, pd.DataFrame) + self.assertGreater(len(rxn_expr._data), 0) + + +class TestDataExport(unittest.TestCase): + """TC-7: Test data export methods.""" + + def setUp(self): + """Create test expression.""" + df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene2', 'gene3'], + 'condition1': [10.0, 20.0, 30.0], + 'condition2': [5.0, 10.0, 15.0] + }) + + genome = MSGenome() + for i in range(1, 4): + genome.features.append(MSFeature(f'gene{i}', '')) + + self.expr = MSExpression.from_dataframe( + df, + genome=genome, + id_column='gene_id' + ) + + def test_get_dataframe_default_format(self): + """Test get_dataframe returns index format by default.""" + df = self.expr.get_dataframe() + + self.assertEqual(df.index.name, 'feature_id') + self.assertIn('gene1', df.index) + self.assertIn('condition1', df.columns) + + def test_get_dataframe_reset_index(self): + """Test get_dataframe with reset_index=True.""" + df = self.expr.get_dataframe(reset_index=True) + + self.assertIn('feature_id', df.columns) + self.assertIn('condition1', df.columns) + + def test_get_dataframe_returns_copy(self): + """Test that get_dataframe returns a copy, not reference.""" + df = self.expr.get_dataframe() + df.loc['gene1', 'condition1'] = 999.0 + + # Original should be unchanged + original_value = self.expr.get_value('gene1', 'condition1') + self.assertAlmostEqual(original_value, 10.0) + + def test_get_dataframe_preserves_column_order(self): + """Test that column order is preserved.""" + df = self.expr.get_dataframe() + columns = list(df.columns) + + self.assertEqual(columns, ['condition1', 'condition2']) + + +class TestEdgeCasesAndErrorHandling(unittest.TestCase): + """TC-8: Test edge cases and error handling.""" + + def test_empty_dataframe_loading(self): + """Test loading empty DataFrame.""" + df = pd.DataFrame(columns=['gene_id', 'condition1']) + genome = MSGenome() + + expr = MSExpression.from_dataframe(df, genome=genome, id_column='gene_id') + + self.assertEqual(len(expr.features), 0) + self.assertEqual(len(expr._data), 0) + + def test_single_feature_single_condition(self): + """Test with single feature and single condition.""" + df = pd.DataFrame({ + 'gene_id': ['gene1'], + 'condition1': [10.0] + }) + + genome = MSGenome() + genome.features.append(MSFeature('gene1', '')) + + expr = MSExpression.from_dataframe(df, genome=genome, id_column='gene_id') + + self.assertEqual(len(expr.features), 1) + self.assertEqual(len(expr.conditions), 1) + self.assertAlmostEqual(expr.get_value('gene1', 'condition1'), 10.0) + + def test_all_nan_condition(self): + """Test condition 
with all NaN values.""" + df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene2'], + 'condition1': [np.nan, np.nan] + }) + + genome = MSGenome() + genome.features.append(MSFeature('gene1', '')) + genome.features.append(MSFeature('gene2', '')) + + expr = MSExpression.from_dataframe(df, genome=genome, id_column='gene_id') + + cond = expr.conditions.get_by_id('condition1') + self.assertIsNone(cond.average_value()) + + def test_mixed_numeric_non_numeric_values(self): + """Test that non-numeric values are converted to NaN.""" + df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene2'], + 'condition1': ['10.0', 'invalid'] + }) + + genome = MSGenome() + genome.features.append(MSFeature('gene1', '')) + genome.features.append(MSFeature('gene2', '')) + + expr = MSExpression.from_dataframe(df, genome=genome, id_column='gene_id') + + value1 = expr.get_value('gene1', 'condition1') + value2 = expr.get_value('gene2', 'condition1') + + self.assertAlmostEqual(value1, 10.0) + self.assertIsNone(value2) + + def test_duplicate_feature_ids(self): + """Test handling of duplicate feature IDs.""" + df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene1', 'gene2'], + 'condition1': [10.0, 20.0, 30.0] + }) + + genome = MSGenome() + genome.features.append(MSFeature('gene1', '')) + genome.features.append(MSFeature('gene2', '')) + + expr = MSExpression.from_dataframe(df, genome=genome, id_column='gene_id') + + # Should handle duplicates gracefully (last value wins in pandas) + self.assertIsNotNone(expr.get_value('gene1', 'condition1')) + + def test_nonexistent_id_column(self): + """Test behavior with nonexistent id_column.""" + df = pd.DataFrame({ + 'gene_id': ['gene1'], + 'condition1': [10.0] + }) + + genome = MSGenome() + + with self.assertRaises(KeyError): + MSExpression.from_dataframe( + df, + genome=genome, + id_column='nonexistent' + ) + + def test_feature_not_in_genome(self): + """Test behavior when feature not found in genome.""" + df = pd.DataFrame({ + 'gene_id': ['gene1', 'gene_missing'], + 'condition1': [10.0, 20.0] + }) + + genome = MSGenome() + genome.features.append(MSFeature('gene1', '')) + + expr = MSExpression.from_dataframe( + df, + genome=genome, + create_missing_features=False, + id_column='gene_id' + ) + + # Only gene1 should be loaded + self.assertEqual(len(expr.features), 1) + self.assertNotIn('gene_missing', expr.features) + + +class TestTypeAnnotations(unittest.TestCase): + """TC-9: Test type annotations and signatures.""" + + def test_from_dataframe_type_annotations(self): + """Test that from_dataframe has proper type annotations.""" + import inspect + sig = inspect.signature(MSExpression.from_dataframe) + + # Check key parameters have annotations + self.assertIn('df', sig.parameters) + self.assertIn('genome', sig.parameters) + # Check return annotation exists + self.assertIsNot(sig.return_annotation, inspect.Signature.empty) + + def test_get_value_type_annotations(self): + """Test that get_value has proper type annotations.""" + import inspect + sig = inspect.signature(MSExpression.get_value) + + self.assertIn('feature', sig.parameters) + self.assertIn('condition', sig.parameters) + + def test_statistical_methods_type_annotations(self): + """Test that statistical methods have type annotations.""" + import inspect + + methods = [ + MSCondition.average_value, + MSCondition.lowest_value, + MSCondition.highest_value, + MSCondition.sum_value + ] + + for method in methods: + sig = inspect.signature(method) + # Should have return annotation + self.assertIsNotNone(sig.return_annotation) + + +class 
TestComputeGeneScore(unittest.TestCase):
+    """TC-10: Test compute_gene_score helper function."""
+
+    def test_compute_gene_score_single_gene(self):
+        """Test compute_gene_score with single gene."""
+        from cobra.core.gene import parse_gpr
+
+        gpr = parse_gpr('gene1')
+        values = {'gene1': 10.0}
+
+        score = compute_gene_score(gpr, values, default=0.0)
+        self.assertAlmostEqual(score, 10.0)
+
+    def test_compute_gene_score_or_operation(self):
+        """Test compute_gene_score with OR operation."""
+        from cobra.core.gene import parse_gpr
+
+        gpr = parse_gpr('gene1 or gene2')
+        values = {'gene1': 10.0, 'gene2': 20.0}
+
+        score = compute_gene_score(gpr, values, default=0.0)
+        self.assertAlmostEqual(score, 30.0)  # Sum
+
+    def test_compute_gene_score_and_operation(self):
+        """Test compute_gene_score with AND operation."""
+        from cobra.core.gene import parse_gpr
+
+        gpr = parse_gpr('gene1 and gene2')
+        values = {'gene1': 10.0, 'gene2': 20.0}
+
+        score = compute_gene_score(gpr, values, default=0.0)
+        self.assertAlmostEqual(score, 10.0)  # Min
+
+    def test_compute_gene_score_missing_gene_uses_default(self):
+        """Test compute_gene_score with missing gene uses default."""
+        from cobra.core.gene import parse_gpr
+
+        gpr = parse_gpr('gene1')
+        values = {}
+
+        score = compute_gene_score(gpr, values, default=5.0)
+        self.assertAlmostEqual(score, 5.0)
+
+
+class TestFitModelFluxToData(unittest.TestCase):
+    """Test fit_model_flux_to_data function."""
+
+    def setUp(self):
+        """Create test models and expression data for testing."""
+        # Create a simple model with reactions and genes
+        self.model = Model('test_model')
+
+        # Add genes
+        gene1 = Gene('gene1')
+        gene2 = Gene('gene2')
+        gene3 = Gene('gene3')
+        self.model.genes.extend([gene1, gene2, gene3])
+
+        # Add reactions with GPR rules
+        from cobra import Metabolite
+        met_a = Metabolite('met_a', compartment='c')
+        met_b = Metabolite('met_b', compartment='c')
+        met_c = Metabolite('met_c', compartment='c')
+
+        rxn1 = Reaction('rxn1')
+        rxn1.gene_reaction_rule = 'gene1'
+        rxn1.add_metabolites({met_a: -1, met_b: 1})
+        rxn1.bounds = (-10, 10)
+
+        rxn2 = Reaction('rxn2')
+        rxn2.gene_reaction_rule = 'gene2'
+        rxn2.add_metabolites({met_b: -1, met_c: 1})
+        rxn2.bounds = (-10, 10)
+
+        rxn3 = Reaction('rxn3')
+        rxn3.gene_reaction_rule = 'gene3'
+        rxn3.add_metabolites({met_c: -1, met_a: 1})
+        rxn3.bounds = (-10, 10)
+
+        # Add biomass reaction
+        biomass = Reaction('biomass')
+        biomass.add_metabolites({met_b: -1})
+        biomass.bounds = (0, 10)
+
+        self.model.add_reactions([rxn1, rxn2, rxn3, biomass])
+        self.model.objective = 'biomass'
+
+        # Create genome for gene expression
+        self.genome = MSGenome()
+        self.genome.features.append(MSFeature('gene1', ''))
+        self.genome.features.append(MSFeature('gene2', ''))
+        self.genome.features.append(MSFeature('gene3', ''))
+
+    def test_fit_model_flux_basic(self):
+        """Task 5.3: Test basic functionality with default thresholds."""
+        # Create gene expression data for all genes (relative abundances sum to 1.0)
+        df = pd.DataFrame({
+            'gene_id': ['gene1', 'gene2', 'gene3'],
+            'condition1': [0.5, 0.3, 0.2]
+        })
+
+        expression = MSExpression.from_dataframe(
+            df,
+            genome=self.genome,
+            id_column='gene_id',
+            type='RelativeAbundance'
+        )
+
+        # Create MSModelUtil (mock basic functionality)
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util = MSModelUtil.get(self.model)
+
+        # Call fit_model_flux_to_data; the cobra Solution sits under the "solution" key
+        result = expression.fit_model_flux_to_data(
+            model=model_util,
+            condition='condition1'
+        )
+
+        # Verify the returned solution is optimal
+        self.assertEqual(result["solution"].status, 'optimal')
+        self.assertIsNotNone(result["solution"].objective_value)
+
+    def test_fit_model_flux_invalid_thresholds(self):
+        """Task 5.7: Test invalid threshold validation."""
+        df = pd.DataFrame({
+            'gene_id': ['gene1', 'gene2', 'gene3'],
+            'condition1': [0.5, 0.3, 0.2]
+        })
+
+        expression = MSExpression.from_dataframe(
+            df,
+            genome=self.genome,
+            id_column='gene_id',
+            type='RelativeAbundance'
+        )
+
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util = MSModelUtil.get(self.model)
+
+        # Test with activation <= deactivation
+        with self.assertRaises(ValueError) as context:
+            expression.fit_model_flux_to_data(
+                model=model_util,
+                condition='condition1',
+                activation_threshold=0.001,
+                deactivation_threshold=0.002
+            )
+
+        self.assertIn('must be greater than', str(context.exception))
+
+    def test_fit_model_flux_missing_condition_error(self):
+        """Task 5.6: Test missing condition error."""
+        df = pd.DataFrame({
+            'gene_id': ['gene1', 'gene2', 'gene3'],
+            'condition1': [0.5, 0.3, 0.2]
+        })
+
+        expression = MSExpression.from_dataframe(
+            df,
+            genome=self.genome,
+            id_column='gene_id',
+            type='RelativeAbundance'
+        )
+
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util = MSModelUtil.get(self.model)
+
+        # Test with non-existent condition
+        with self.assertRaises(ValueError) as context:
+            expression.fit_model_flux_to_data(
+                model=model_util,
+                condition='nonexistent_condition'
+            )
+
+        self.assertIn('not found in expression data', str(context.exception))
+        self.assertIn('Available conditions', str(context.exception))
+
+    def test_fit_model_flux_genome_to_model_conversion(self):
+        """Task 5.4: Test genome to model conversion."""
+        # Create gene expression (genome-level)
+        df = pd.DataFrame({
+            'gene_id': ['gene1', 'gene2', 'gene3'],
+            'condition1': [0.5, 0.3, 0.2]
+        })
+
+        expression = MSExpression.from_dataframe(
+            df,
+            genome=self.genome,
+            id_column='gene_id',
+            type='RelativeAbundance'
+        )
+
+        # Verify it's genome-level
+        self.assertIsInstance(expression.object, MSGenome)
+
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util = MSModelUtil.get(self.model)
+
+        # Call fit_model_flux_to_data - should convert to reaction level
+        result = expression.fit_model_flux_to_data(
+            model=model_util,
+            condition='condition1'
+        )
+
+        # Verify optimization succeeded
+        self.assertEqual(result["solution"].status, 'optimal')
+
+    def test_fit_model_flux_model_mismatch_error(self):
+        """Task 5.5: Test model mismatch error."""
+        # Create reaction-level expression for model A
+        rxn_expr = MSExpression('RelativeAbundance')
+        rxn_expr.object = self.model
+        for rxn in self.model.reactions:
+            rxn_expr.add_feature(rxn.id)
+
+        cond = MSCondition('condition1', rxn_expr)
+        rxn_expr.conditions.append(cond)
+
+        # Add some values
+        for rxn in self.model.reactions:
+            rxn_expr.features.get_by_id(rxn.id).add_value(cond, 0.5)
+
+        # Create a different model (Model B)
+        model_b = Model('test_model_b')
+        gene_b = Gene('gene_b')
+        model_b.genes.append(gene_b)
+
+        from cobra import Metabolite
+        met_b = Metabolite('met_b', compartment='c')
+        rxn_b = Reaction('rxn_b')
+        rxn_b.gene_reaction_rule = 'gene_b'
+        rxn_b.add_metabolites({met_b: -1})
+        model_b.add_reactions([rxn_b])
+
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util_b = MSModelUtil.get(model_b)
+
+        # Try to fit with mismatched model
+        with self.assertRaises(ValueError) as context:
+            rxn_expr.fit_model_flux_to_data(
+                model=model_util_b,
+                condition='condition1'
+            )
+
+
+    def test_fit_model_flux_type_transformations(self):
+        """Task 5.8: Test type transformations to RelativeAbundance."""
+        # Test FPKM transformation
+        df_fpkm = pd.DataFrame({
+            'gene_id': ['gene1', 'gene2', 'gene3'],
+            'condition1': [100.0, 200.0, 300.0]  # FPKM values
+        })
+
+        expression_fpkm = MSExpression.from_dataframe(
+            df_fpkm,
+            genome=self.genome,
+            id_column='gene_id',
+            type='FPKM'
+        )
+
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util = MSModelUtil.get(self.model)
+
+        # Should transform FPKM to RelativeAbundance
+        solution = expression_fpkm.fit_model_flux_to_data(
+            model=model_util,
+            condition='condition1',
+            activation_threshold=0.1,  # Lower threshold for transformed data
+            deactivation_threshold=0.01
+        )
+
+        self.assertEqual(solution.status, 'optimal')
+
+        # Verify original data is unchanged
+        self.assertEqual(expression_fpkm.type, 'FPKM')
+        self.assertAlmostEqual(expression_fpkm.get_value('gene1', 'condition1'), 100.0)
+
+    def test_fit_model_flux_custom_thresholds(self):
+        """Task 5.9: Test custom activation/deactivation thresholds."""
+        df = pd.DataFrame({
+            'gene_id': ['gene1', 'gene2', 'gene3'],
+            'condition1': [0.5, 0.05, 0.001]  # High, medium, low expression
+        })
+
+        expression = MSExpression.from_dataframe(
+            df,
+            genome=self.genome,
+            id_column='gene_id',
+            type='RelativeAbundance'
+        )
+
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util = MSModelUtil.get(self.model)
+
+        # Use custom thresholds
+        solution = expression.fit_model_flux_to_data(
+            model=model_util,
+            condition='condition1',
+            activation_threshold=0.1,  # gene1 should be activated
+            deactivation_threshold=0.01  # gene3 should be deactivated
+        )
+
+        self.assertEqual(solution.status, 'optimal')
+
+    def test_fit_model_flux_no_activated_reactions(self):
+        """Task 5.10: Test error when no reactions are above the threshold."""
+        df = pd.DataFrame({
+            'gene_id': ['gene1', 'gene2', 'gene3'],
+            'condition1': [0.0001, 0.0001, 0.0001]  # All very low
+        })
+
+        expression = MSExpression.from_dataframe(
+            df,
+            genome=self.genome,
+            id_column='gene_id',
+            type='RelativeAbundance'
+        )
+
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util = MSModelUtil.get(self.model)
+
+        # Use a very high threshold - no reactions should be activated
+        with self.assertRaises(ValueError) as context:
+            expression.fit_model_flux_to_data(
+                model=model_util,
+                condition='condition1',
+                activation_threshold=10.0
+            )
+
+        self.assertIn('No reactions have expression values above', str(context.exception))
+
+    def test_fit_model_flux_all_reactions_activated(self):
+        """Task 5.11: Test with very low thresholds - all reactions activated."""
+        df = pd.DataFrame({
+            'gene_id': ['gene1', 'gene2', 'gene3'],
+            'condition1': [0.5, 0.3, 0.2]
+        })
+
+        expression = MSExpression.from_dataframe(
+            df,
+            genome=self.genome,
+            id_column='gene_id',
+            type='RelativeAbundance'
+        )
+
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util = MSModelUtil.get(self.model)
+
+        # Use very low thresholds
+        solution = expression.fit_model_flux_to_data(
+            model=model_util,
+            condition='condition1',
+            activation_threshold=0.00001,
+            deactivation_threshold=0.000001
+        )
+
+        # Should still succeed with all reactions activated
+        self.assertEqual(solution.status, 'optimal')
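+
+    # Per the TC-10 tests above, genes without measurements fall back to
+    # compute_gene_score's default argument; the next test checks that
+    # reactions lacking expression data are simply ignored rather than
+    # causing an error.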
+
+    def test_fit_model_flux_missing_expression_data(self):
+        """Task 5.12: Test that reactions without expression data are ignored."""
+        # Only provide expression for gene1 and gene2, not gene3
+        df = pd.DataFrame({
+            'gene_id': ['gene1', 'gene2'],  # gene3 missing
+            'condition1': [0.5, 0.3]
+        })
+
+        expression = MSExpression.from_dataframe(
+            df,
+            genome=self.genome,
+            id_column='gene_id',
+            type='RelativeAbundance',
+            create_missing_features=False
+        )
+
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util = MSModelUtil.get(self.model)
+
+        # Should handle missing data gracefully
+        solution = expression.fit_model_flux_to_data(
+            model=model_util,
+            condition='condition1'
+        )
+
+        # Optimization should still work
+        self.assertEqual(solution.status, 'optimal')
+
+    def test_fit_model_flux_multiple_conditions(self):
+        """Task 5.14: Test that each of multiple conditions can be fitted."""
+        df = pd.DataFrame({
+            'gene_id': ['gene1', 'gene2', 'gene3'],
+            'condition1': [0.5, 0.3, 0.2],
+            'condition2': [0.2, 0.5, 0.3],
+            'condition3': [0.3, 0.2, 0.5]
+        })
+
+        expression = MSExpression.from_dataframe(
+            df,
+            genome=self.genome,
+            id_column='gene_id',
+            type='RelativeAbundance'
+        )
+
+        from modelseedpy.core.msmodelutl import MSModelUtil
+        model_util = MSModelUtil.get(self.model)
+
+        # Run the fit for each condition
+        solutions = {}
+        for cond in ['condition1', 'condition2', 'condition3']:
+            solutions[cond] = expression.fit_model_flux_to_data(
+                model=model_util,
+                condition=cond
+            )
+
+        # Verify all succeeded
+        for cond, sol in solutions.items():
+            self.assertEqual(sol.status, 'optimal')
+
+        # Verify original data unchanged
+        self.assertEqual(expression.type, 'RelativeAbundance')
+        self.assertAlmostEqual(expression.get_value('gene1', 'condition1'), 0.5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_data/mock_data.py b/tests/test_data/mock_data.py
index 4c86b371..478aad0e 100644
--- a/tests/test_data/mock_data.py
+++ b/tests/test_data/mock_data.py
@@ -271,10 +271,9 @@ def remap(model, bigg_to_seed_cpd, bigg_to_seed_rxn, index="0"):
 
 
 def mock_model_ecoli_core(seed=True):
-    from cobra.io import load_json_model
-    from os import path
+    from cobra.io import load_model
 
-    model = load_json_model(path.join(path.dirname(__file__), "e_coli_core.json"))
+    model = load_model("textbook")
     if not seed:
         return model
     bigg_to_seed_cpd = {
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 00000000..0aa1e6aa
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,36 @@
+[tox]
+envlist = py39,py310,py311
+
+[gh-actions]
+python =
+    3.9: py39
+    3.10: py310
+    3.11: py311
+
+[testenv]
+setenv = ARCHIVEINTERFACE_CPCONFIG = {toxinidir}/server.conf
+deps =
+    build
+    coverage
+    mock
+    pre-commit
+    pytest
+    pytest-cov
+    recommonmark
+    setuptools
+commands = pytest --cov --cov-append --cov-report=term-missing
+changedir = tests
+
+[testenv:report]
+deps = coverage
+skip_install = true
+commands =
+    coverage report -m
+    coverage html
+changedir = tests
+
+[testenv:clean]
+deps = coverage
+skip_install = true
+commands = coverage erase
+changedir = tests