diff --git a/.secrets.baseline b/.secrets.baseline index e1737e6..6a81058 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": "^.secrets.baseline$", "lines": null }, - "generated_at": "2026-04-09T04:39:13Z", + "generated_at": "2026-05-11T09:05:52Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -301,6 +301,66 @@ "verified_result": null } ], + "docs/chapters/05_dph_services/overview.rst": [ + { + "hashed_secret": "45d676e7c6ab44cf4b8fa366ef2d8fccd3e6d6e6", + "is_secret": false, + "is_verified": false, + "line_number": 123, + "type": "Secret Keyword", + "verified_result": null + } + ], + "docs/chapters/05_dph_services/usage_guide.rst": [ + { + "hashed_secret": "11fa7c37d697f30e6aee828b4426a10f83ab2380", + "is_secret": false, + "is_verified": false, + "line_number": 63, + "type": "Secret Keyword", + "verified_result": null + } + ], + "docs/chapters/06_odcs_generator/collibra_integration.rst": [ + { + "hashed_secret": "564e340cd48437d2dfe876ee154cc99dc4d0d137", + "is_secret": false, + "is_verified": false, + "line_number": 97, + "type": "Secret Keyword", + "verified_result": null + } + ], + "docs/chapters/06_odcs_generator/examples.rst": [ + { + "hashed_secret": "564e340cd48437d2dfe876ee154cc99dc4d0d137", + "is_secret": false, + "is_verified": false, + "line_number": 61, + "type": "Secret Keyword", + "verified_result": null + } + ], + "docs/chapters/06_odcs_generator/index.rst": [ + { + "hashed_secret": "564e340cd48437d2dfe876ee154cc99dc4d0d137", + "is_secret": false, + "is_verified": false, + "line_number": 93, + "type": "Secret Keyword", + "verified_result": null + } + ], + "docs/chapters/06_odcs_generator/informatica_integration.rst": [ + { + "hashed_secret": "564e340cd48437d2dfe876ee154cc99dc4d0d137", + "is_secret": false, + "is_verified": false, + "line_number": 77, + "type": "Secret Keyword", + "verified_result": null + } + ], "examples/auth_provider_usage.py": [ { "hashed_secret": "df5cc5832dc34a455c18662ac84587ea19cf2435", diff 
--git a/docs/api/data_product_recommender/index.rst b/docs/api/data_product_recommender/index.rst new file mode 100644 index 0000000..713e0f5 --- /dev/null +++ b/docs/api/data_product_recommender/index.rst @@ -0,0 +1,68 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _api_data_product_recommender: + +Data Product Recommender Reference +=================================== + +Class reference for the Data Product Recommender module. + +Core Classes +------------ + +.. currentmodule:: wxdi.data_product_recommender.recommender + +.. autoclass:: DataProductRecommender + :members: + :undoc-members: + :show-inheritance: + +Platform Parsers +---------------- + +.. currentmodule:: wxdi.data_product_recommender.platforms + +.. autoclass:: SnowflakeQueryParser + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: DatabricksQueryParser + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: BigQueryQueryParser + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: WatsonxDataQueryParser + :members: + :undoc-members: + :show-inheritance: + +Base Classes +------------ + +.. currentmodule:: wxdi.data_product_recommender.base + +.. autoclass:: QueryLogParser + :members: + :undoc-members: + :show-inheritance: + +.. 
Made with Bob diff --git a/docs/api/dph_services/core.rst b/docs/api/dph_services/core.rst new file mode 100644 index 0000000..aa92c32 --- /dev/null +++ b/docs/api/dph_services/core.rst @@ -0,0 +1,34 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _api_dph_services_core: + +Core Classes +============ + +Main service class and data models for Data Product Hub Services. + +DphV1 Service +------------- + +.. currentmodule:: wxdi.dph_services + +.. autoclass:: DphV1 + :members: + :undoc-members: + :show-inheritance: + :inherited-members: + +.. Made with Bob \ No newline at end of file diff --git a/docs/api/dph_services/index.rst b/docs/api/dph_services/index.rst new file mode 100644 index 0000000..c42ff65 --- /dev/null +++ b/docs/api/dph_services/index.rst @@ -0,0 +1,43 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. 
_api_dph_services: + +DPH Services API +================ + +API reference for the Data Product Hub Services module. + +.. toctree:: + :maxdepth: 2 + + core + +Main Service Class +------------------ + +The DphV1 class provides access to all Data Product Hub Services operations. +For detailed API reference including all methods, see :ref:`api_dph_services_core`. + +.. currentmodule:: wxdi.dph_services + +.. autoclass:: DphV1 + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ + :no-index: + +.. Made with Bob \ No newline at end of file diff --git a/docs/api/index.rst b/docs/api/index.rst index bab9877..6455d0f 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -27,6 +27,9 @@ This API reference documentation is auto-generated from the source code docstrin common/index dq_validator/index + dph_services/index + odcs_generator/index + data_product_recommender/index Module Organization ------------------- @@ -50,6 +53,36 @@ In-memory data quality validation: * :ref:`REST API Providers` - IBM Cloud Pak for Data integration * :ref:`Result Consolidation` - Result aggregation and analysis +DPH Services Module +~~~~~~~~~~~~~~~~~~~ + +Data Product Hub API client: + +* :ref:`DphV1 Service` - Main service class for Data Product Hub operations +* Container, data product, draft, release, and domain management +* Contract terms and template operations +* Asset visualization + +ODCS Generator Module +~~~~~~~~~~~~~~~~~~~~~ + +Generate Open Data Contract Standard files: + +* :ref:`Collibra Integration` - CollibraClient and ODCSGenerator classes +* :ref:`Informatica Integration` - InformaticaClient and ODCSGenerator classes +* ODCS v3.1.0 compliant YAML generation +* Command-line and Python interfaces + +Data Product Recommender Module +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Query log analysis tool for data product recommendations: + +* :ref:`DataProductRecommender` - Core recommendation engine +* Platform-specific query log parsers (Snowflake, 
Databricks, BigQuery, watsonx.data) +* Scoring and ranking algorithms +* CLI and Python interfaces + Navigation Tips --------------- diff --git a/docs/api/odcs_generator/index.rst b/docs/api/odcs_generator/index.rst new file mode 100644 index 0000000..2b45be6 --- /dev/null +++ b/docs/api/odcs_generator/index.rst @@ -0,0 +1,48 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _api_odcs_generator: + +ODCS Generator Reference +======================== + +Class reference for the ODCS Generator module. + +Collibra Integration +-------------------- + +.. currentmodule:: wxdi.odcs_generator.generate_odcs_from_collibra + +.. autoclass:: CollibraClient + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: ODCSGenerator + :members: + :undoc-members: + :show-inheritance: + +Informatica Integration +----------------------- + +.. currentmodule:: wxdi.odcs_generator.generate_odcs_from_informatica + +.. autoclass:: InformaticaClient + :members: + :undoc-members: + :show-inheritance: + +.. Made with Bob diff --git a/docs/chapters/05_dph_services/examples.rst b/docs/chapters/05_dph_services/examples.rst new file mode 100644 index 0000000..2d277c9 --- /dev/null +++ b/docs/chapters/05_dph_services/examples.rst @@ -0,0 +1,514 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _dph_services_examples: + +Examples +======== + +Complete examples demonstrating common use cases for the Data Product Hub Services module. + +Complete Workflow Example +-------------------------- + +This example demonstrates the complete lifecycle of a data product: + +.. code-block:: python + + from wxdi.dph_services import DphV1 + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + from ibm_cloud_sdk_core import ApiException + + # Initialize service + authenticator = IAMAuthenticator('your-api-key') + dph_service = DphV1(authenticator=authenticator) + dph_service.set_service_url('https://your-dph-instance.com') + + # Step 1: Initialize container + print("Initializing container...") + init_response = dph_service.initialize( + include=['delivery_methods', 'data_product_samples', 'domains_multi_industry'] + ) + container_id = init_response.result['container']['id'] + print(f"Container initialized: {container_id}") + + # Step 2: Create a domain + print("\nCreating domain...") + domain = dph_service.create_data_product_domain( + name='Customer Analytics', + description='Customer behavior and analytics data products', + container={'id': container_id} + ) + domain_id = domain.result['id'] + print(f"Domain created: {domain_id}") + + # Step 3: Create data product with draft + print("\nCreating data product...") + data_product = dph_service.create_data_product( + drafts=[{ + 'version': '1.0.0', + 'name': 'Customer Purchase History', + 'description': 'Historical customer purchase data for analytics', + 'asset': { + 'id': 'asset-12345', + 'container': 
{'id': container_id} + }, + 'domain': { + 'id': domain_id, + 'name': 'Customer Analytics' + }, + 'parts_out': [{ + 'asset': { + 'id': 'asset-12345', + 'container': {'id': container_id} + }, + 'delivery_methods': [{ + 'id': 'delivery-method-001', + 'container': {'id': container_id} + }] + }] + }] + ) + + product_id = data_product.result['id'] + draft_id = data_product.result['drafts'][0]['id'] + print(f"Data product created: {product_id}") + print(f"Draft created: {draft_id}") + + # Step 4: Add contract terms + print("\nAdding contract terms...") + contract_terms = dph_service.get_data_product_draft_contract_terms( + data_product_id=product_id, + draft_id=draft_id + ) + + terms_id = contract_terms.result['id'] + + doc = dph_service.create_draft_contract_terms_document( + data_product_id=product_id, + draft_id=draft_id, + contract_terms_id=terms_id, + type='terms_and_conditions', + name='Data Usage Terms', + url='https://example.com/terms.pdf' + ) + print(f"Contract document added: {doc.result['id']}") + + # Step 5: Publish the draft + print("\nPublishing draft...") + release = dph_service.publish_data_product_draft( + data_product_id=product_id, + draft_id=draft_id + ) + release_id = release.result['id'] + print(f"Release published: {release_id}") + + # Step 6: Create a new version + print("\nCreating new version...") + new_draft = dph_service.create_data_product_draft( + data_product_id=product_id, + asset={'id': 'asset-12345', 'container': {'id': container_id}}, + version='1.1.0', + name='Customer Purchase History v1.1', + description='Enhanced with additional purchase metrics' + ) + new_draft_id = new_draft.result['id'] + print(f"New draft created: {new_draft_id}") + + print("\n✅ Complete workflow executed successfully!") + +Batch Operations Example +------------------------- + +Create multiple data products efficiently: + +.. 
code-block:: python + + from wxdi.dph_services import DphV1 + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + + # Initialize service + authenticator = IAMAuthenticator('your-api-key') + dph_service = DphV1(authenticator=authenticator) + dph_service.set_service_url('https://your-dph-instance.com') + + # Define multiple data products + products_to_create = [ + { + 'name': 'Customer Demographics', + 'description': 'Customer demographic information', + 'asset_id': 'asset-001' + }, + { + 'name': 'Transaction History', + 'description': 'Historical transaction records', + 'asset_id': 'asset-002' + }, + { + 'name': 'Product Catalog', + 'description': 'Complete product catalog data', + 'asset_id': 'asset-003' + } + ] + + # Create all products + created_products = [] + + for product_info in products_to_create: + try: + product = dph_service.create_data_product( + drafts=[{ + 'version': '1.0.0', + 'name': product_info['name'], + 'description': product_info['description'], + 'asset': { + 'id': product_info['asset_id'], + 'container': {'id': 'container-123'} + } + }] + ) + created_products.append(product.result) + print(f"✅ Created: {product_info['name']}") + except Exception as e: + print(f"❌ Failed to create {product_info['name']}: {e}") + + print(f"\nTotal products created: {len(created_products)}") + +Pagination Example +------------------ + +Handle large datasets with pagination: + +.. 
code-block:: python + + from wxdi.dph_services import DphV1 + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + + # Initialize service + authenticator = IAMAuthenticator('your-api-key') + dph_service = DphV1(authenticator=authenticator) + dph_service.set_service_url('https://your-dph-instance.com') + + # Method 1: Using pager (recommended) + print("Fetching all data products using pager...") + all_products = [] + pager = dph_service.list_data_products_with_pager(limit=50) + + for page in pager: + all_products.extend(page['data_products']) + print(f"Fetched {len(page['data_products'])} products...") + + print(f"Total products: {len(all_products)}") + + # Method 2: Manual pagination + print("\nManual pagination example...") + all_products_manual = [] + start = None + + while True: + response = dph_service.list_data_products( + limit=50, + start=start + ) + + products = response.result['data_products'] + all_products_manual.extend(products) + + # Check if there are more pages + if 'next' not in response.result or not response.result['next']: + break + + # Extract start token from next link + start = response.result['next'].get('start') + + print(f"Total products (manual): {len(all_products_manual)}") + +Error Handling Example +---------------------- + +Robust error handling for production use: + +.. 
code-block:: python + + from wxdi.dph_services import DphV1 + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + from ibm_cloud_sdk_core import ApiException + import time + + # Initialize service + authenticator = IAMAuthenticator('your-api-key') + dph_service = DphV1(authenticator=authenticator) + dph_service.set_service_url('https://your-dph-instance.com') + + def create_data_product_with_retry(dph_service, drafts, max_retries=3): + """Create data product with retry logic""" + for attempt in range(max_retries): + try: + response = dph_service.create_data_product(drafts=drafts) + return response + except ApiException as e: + if e.code == 429: # Rate limit + wait_time = 2 ** attempt + print(f"Rate limited. Waiting {wait_time}s before retry...") + time.sleep(wait_time) + elif e.code >= 500: # Server error + if attempt < max_retries - 1: + print(f"Server error. Retrying... (attempt {attempt + 1})") + time.sleep(2 ** attempt) + else: + raise + elif e.code == 404: + print(f"Resource not found: {e.message}") + raise + elif e.code == 401: + print("Authentication failed. Check your credentials.") + raise + elif e.code == 403: + print("Insufficient permissions.") + raise + else: + print(f"API Error {e.code}: {e.message}") + raise + + raise Exception("Max retries exceeded") + + # Use the retry function + try: + drafts = [{ + 'version': '1.0.0', + 'name': 'Test Product', + 'description': 'Test description', + 'asset': {'id': 'asset-123', 'container': {'id': 'container-456'}} + }] + + product = create_data_product_with_retry(dph_service, drafts) + print(f"✅ Product created: {product.result['id']}") + except Exception as e: + print(f"❌ Failed to create product: {e}") + +Search and Filter Example +-------------------------- + +Find specific data products: + +.. 
code-block:: python + + from wxdi.dph_services import DphV1 + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + + # Initialize service + authenticator = IAMAuthenticator('your-api-key') + dph_service = DphV1(authenticator=authenticator) + dph_service.set_service_url('https://your-dph-instance.com') + + # Get all products and filter + all_products = [] + pager = dph_service.list_data_products_with_pager(limit=100) + + for page in pager: + all_products.extend(page['data_products']) + + # Filter by name pattern + customer_products = [ + p for p in all_products + if 'customer' in p['name'].lower() + ] + print(f"Found {len(customer_products)} customer-related products") + + # Filter by domain + analytics_products = [ + p for p in all_products + if p.get('domain', {}).get('name') == 'Customer Analytics' + ] + print(f"Found {len(analytics_products)} analytics products") + + # Filter by version + v1_products = [ + p for p in all_products + if p['version'].startswith('1.') + ] + print(f"Found {len(v1_products)} v1.x products") + +Contract Template Example +-------------------------- + +Create and use contract templates: + +.. 
code-block:: python + + from wxdi.dph_services import DphV1 + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + + # Initialize service + authenticator = IAMAuthenticator('your-api-key') + dph_service = DphV1(authenticator=authenticator) + dph_service.set_service_url('https://your-dph-instance.com') + + # Create a reusable contract template + template = dph_service.create_contract_template( + name='Standard Data Sharing Agreement', + description='Standard terms for internal data sharing', + contract_terms_documents=[ + { + 'type': 'terms_and_conditions', + 'name': 'Terms and Conditions', + 'url': 'https://example.com/standard-terms.pdf' + }, + { + 'type': 'sla', + 'name': 'Service Level Agreement', + 'url': 'https://example.com/sla.pdf' + } + ] + ) + + template_id = template.result['id'] + print(f"Template created: {template_id}") + + # Use template when creating data products + data_product = dph_service.create_data_product( + drafts=[{ + 'version': '1.0.0', + 'name': 'Sales Data', + 'description': 'Monthly sales data', + 'asset': {'id': 'asset-123', 'container': {'id': 'container-456'}}, + 'contract_terms': { + 'template_id': template_id + } + }] + ) + + print(f"Data product created with template: {data_product.result['id']}") + +Domain Hierarchy Example +------------------------ + +Create and manage domain hierarchies: + +.. 
code-block:: python + + from wxdi.dph_services import DphV1 + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + + # Initialize service + authenticator = IAMAuthenticator('your-api-key') + dph_service = DphV1(authenticator=authenticator) + dph_service.set_service_url('https://your-dph-instance.com') + + # Create parent domain + parent_domain = dph_service.create_data_product_domain( + name='Customer Data', + description='All customer-related data products', + container={'id': 'container-123'} + ) + parent_id = parent_domain.result['id'] + print(f"Parent domain created: {parent_id}") + + # Create subdomains + subdomains = [ + {'name': 'Demographics', 'description': 'Customer demographic data'}, + {'name': 'Behavior', 'description': 'Customer behavior analytics'}, + {'name': 'Transactions', 'description': 'Customer transaction history'} + ] + + for subdomain_info in subdomains: + subdomain = dph_service.create_data_product_subdomain( + domain_id=parent_id, + name=subdomain_info['name'], + description=subdomain_info['description'] + ) + print(f"Subdomain created: {subdomain_info['name']}") + + # List all domains with hierarchy + domains = dph_service.list_data_product_domains(limit=100) + + for domain in domains.result['domains']: + print(f"\n{domain['name']}") + if 'subdomains' in domain: + for subdomain in domain['subdomains']: + print(f" └─ {subdomain['name']}") + +Monitoring and Reporting Example +--------------------------------- + +Generate summary reports on the data product inventory (counts by domain, version, and state): + +.. 
code-block:: python + + from wxdi.dph_services import DphV1 + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + from collections import defaultdict + from datetime import datetime + + # Initialize service + authenticator = IAMAuthenticator('your-api-key') + dph_service = DphV1(authenticator=authenticator) + dph_service.set_service_url('https://your-dph-instance.com') + + # Collect all data products + all_products = [] + pager = dph_service.list_data_products_with_pager(limit=100) + + for page in pager: + all_products.extend(page['data_products']) + + # Generate statistics + stats = { + 'total_products': len(all_products), + 'by_domain': defaultdict(int), + 'by_version': defaultdict(int), + 'by_state': defaultdict(int) + } + + for product in all_products: + # Count by domain + domain_name = product.get('domain', {}).get('name', 'Unknown') + stats['by_domain'][domain_name] += 1 + + # Count by version + version = product.get('version', 'Unknown') + major_version = version.split('.')[0] if '.' in version else version + stats['by_version'][f"v{major_version}.x"] += 1 + + # Count by state + state = product.get('state', 'Unknown') + stats['by_state'][state] += 1 + + # Print report + print("=" * 50) + print("DATA PRODUCT REPORT") + print("=" * 50) + print(f"\nTotal Data Products: {stats['total_products']}") + + print("\nBy Domain:") + for domain, count in sorted(stats['by_domain'].items()): + print(f" {domain}: {count}") + + print("\nBy Version:") + for version, count in sorted(stats['by_version'].items()): + print(f" {version}: {count}") + + print("\nBy State:") + for state, count in sorted(stats['by_state'].items()): + print(f" {state}: {count}") + +See Also +-------- + +- :ref:`dph_services_usage` - Detailed usage guide +- :ref:`api_dph_services` - API reference +- :ref:`dph_services_overview` - Architecture overview + +.. 
Made with Bob diff --git a/docs/chapters/05_dph_services/index.rst b/docs/chapters/05_dph_services/index.rst new file mode 100644 index 0000000..59a1455 --- /dev/null +++ b/docs/chapters/05_dph_services/index.rst @@ -0,0 +1,115 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _dph_services: + +Data Product Hub Services +========================== + +Python client library for the IBM Data Product Hub API, providing programmatic access to data product management, container operations, contract terms, and asset visualization. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + overview + usage_guide + examples + +Overview +-------- + +The ``dph_services`` module provides a complete Python SDK for interacting with IBM Data Product Hub services. It enables developers to programmatically manage the entire data product lifecycle, from initialization to publication and retirement. + +Key Features +------------ + +**Container Management** + Initialize and configure data product containers with delivery methods, samples, and domain structures. + +**Data Product Lifecycle** + Create, update, publish, and retire data products with full version control and draft management. + +**Contract Terms** + Manage contract terms, documents, and templates for data product agreements. + +**Asset Visualization** + Create and manage data asset visualizations for better data discovery. 
+ +**Domain Organization** + Organize data products into domains and subdomains for better categorization. + +**Release Management** + Handle data product releases with versioning and retirement capabilities. + +Quick Start +----------- + +.. code-block:: python + + from wxdi.dph_services import DphV1 + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + + # Initialize authenticator + authenticator = IAMAuthenticator('your-api-key') + + # Create service instance + dph_service = DphV1(authenticator=authenticator) + dph_service.set_service_url('https://your-dph-instance.com') + + # Initialize container + response = dph_service.initialize( + include=['delivery_methods', 'data_product_samples', 'domains_multi_industry'] + ) + + # Create a data product + data_product = dph_service.create_data_product( + drafts=[{ + 'version': '1.0.0', + 'name': 'Customer Analytics Data Product', + 'description': 'Comprehensive customer analytics dataset', + 'asset': { + 'id': 'asset-123', + 'container': {'id': 'container-456'} + } + }] + ) + +Use Cases +--------- + +**Data Product Onboarding** + Automate the creation and configuration of new data products in your data marketplace. + +**Lifecycle Automation** + Build workflows that automatically promote drafts to releases based on quality checks. + +**Contract Management** + Programmatically manage data sharing agreements and terms of use. + +**Catalog Integration** + Integrate with data catalogs to automatically create data products from existing assets. + +**Governance Workflows** + Implement approval workflows and governance policies for data product publication. + +Next Steps +---------- + +- :ref:`dph_services_usage` - Detailed usage guide with examples +- :ref:`dph_services_examples` - Practical code examples +- :ref:`api_dph_services` - Complete API reference + +.. 
Made with Bob diff --git a/docs/chapters/05_dph_services/overview.rst b/docs/chapters/05_dph_services/overview.rst new file mode 100644 index 0000000..ba2042e --- /dev/null +++ b/docs/chapters/05_dph_services/overview.rst @@ -0,0 +1,266 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _dph_services_overview: + +DPH Services Overview +===================== + +The Data Product Hub Services module provides a comprehensive Python SDK for managing data products in IBM Data Product Hub. 
+ +Architecture +------------ + +The module is built on top of the IBM Cloud SDK Core and provides: + +- **Type-safe API**: Full type hints for better IDE support +- **Error handling**: Comprehensive exception handling with detailed error messages +- **Pagination support**: Built-in pagination for large result sets +- **Authentication**: Seamless integration with IBM Cloud authentication + +Core Components +--------------- + +Container Management +~~~~~~~~~~~~~~~~~~~~ + +Containers are the foundation of Data Product Hub, providing: + +- Delivery method configurations +- Sample data products +- Domain structures +- Service credentials + +Data Products +~~~~~~~~~~~~~ + +Data products represent packaged data assets with: + +- Metadata and descriptions +- Version control +- Asset references +- Domain associations +- Contract terms + +Drafts and Releases +~~~~~~~~~~~~~~~~~~~ + +**Drafts**: Work-in-progress versions that can be edited and updated + +**Releases**: Published versions that are immutable and available for consumption + +Contract Terms +~~~~~~~~~~~~~~ + +Legal and business terms governing data product usage: + +- Terms and conditions documents +- Service level agreements +- Usage restrictions +- Compliance requirements + +Domains +~~~~~~~ + +Organizational structure for categorizing data products: + +- Top-level domains (e.g., Finance, Marketing) +- Subdomains for finer categorization +- Multi-industry domain support + +Data Flow +--------- + +1. **Initialize Container**: Set up the data product hub environment +2. **Create Draft**: Define a new data product version +3. **Add Contract Terms**: Attach legal and business terms +4. **Publish Draft**: Convert draft to a release +5. **Manage Lifecycle**: Update, version, or retire releases + +Authentication +-------------- + +The module supports multiple authentication methods: + +**IAM Authentication** (Recommended) + +.. 
code-block:: python + + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + + authenticator = IAMAuthenticator('your-api-key') + dph_service = DphV1(authenticator=authenticator) + +**Bearer Token Authentication** + +.. code-block:: python + + from ibm_cloud_sdk_core.authenticators import BearerTokenAuthenticator + + authenticator = BearerTokenAuthenticator('your-bearer-token') + dph_service = DphV1(authenticator=authenticator) + +**Cloud Pak for Data Authentication** + +.. code-block:: python + + from ibm_cloud_sdk_core.authenticators import CloudPakForDataAuthenticator + + authenticator = CloudPakForDataAuthenticator( + username='your-username', + password='your-password', + url='https://your-cpd-instance.com' + ) + dph_service = DphV1(authenticator=authenticator) + +Error Handling +-------------- + +The SDK uses standard IBM Cloud SDK exceptions: + +.. code-block:: python + + from ibm_cloud_sdk_core import ApiException + + try: + response = dph_service.get_data_product(data_product_id='invalid-id') + except ApiException as e: + print(f"Error Code: {e.code}") + print(f"Error Message: {e.message}") + print(f"HTTP Status: {e.http_response.status_code}") + +Common error codes: + +- **400**: Bad Request - Invalid parameters +- **401**: Unauthorized - Authentication failed +- **403**: Forbidden - Insufficient permissions +- **404**: Not Found - Resource doesn't exist +- **409**: Conflict - Resource already exists +- **500**: Internal Server Error - Service error + +Best Practices +-------------- + +**Use Pagination for Large Datasets** + +.. code-block:: python + + # Use pager for automatic pagination + pager = dph_service.list_data_products_with_pager(limit=50) + for page in pager: + for product in page['data_products']: + process_product(product) + +**Implement Retry Logic** + +.. 
code-block:: python + + import time + from ibm_cloud_sdk_core import ApiException + + def create_with_retry(dph_service, drafts, max_retries=3): + for attempt in range(max_retries): + try: + return dph_service.create_data_product(drafts=drafts) + except ApiException as e: + if e.code == 429 or e.code >= 500: # Rate limit or server error + if attempt < max_retries - 1: + time.sleep(2 ** attempt) # Exponential backoff + continue + raise + +**Validate Before Publishing** + +.. code-block:: python + + def validate_draft(draft): + """Validate draft before publishing""" + required_fields = ['name', 'version', 'asset', 'domain'] + for field in required_fields: + if field not in draft or not draft[field]: + raise ValueError(f"Missing required field: {field}") + return True + +**Use JSON Patch for Updates** + +.. code-block:: python + + # Efficient updates using JSON Patch + patch_operations = [ + {'op': 'replace', 'path': '/description', 'value': 'Updated description'}, + {'op': 'add', 'path': '/tags/-', 'value': 'new-tag'} + ] + + dph_service.update_data_product( + data_product_id=product_id, + json_patch_instructions=patch_operations + ) + +Performance Considerations +-------------------------- + +**Batch Operations** + +When a data product needs several drafts, create them in a single call to reduce API calls: + +.. code-block:: python + + # Create multiple drafts in a single data product + dph_service.create_data_product( + drafts=[draft1, draft2, draft3] + ) + +**Caching** + +Cache frequently accessed data to reduce API calls: + +.. code-block:: python + + from functools import lru_cache + + @lru_cache(maxsize=100) + def get_domain_cached(domain_id): + return dph_service.get_domain(domain_id=domain_id) + +**Parallel Processing** + +Use concurrent requests for independent operations: + +.. 
code-block:: python + + from concurrent.futures import ThreadPoolExecutor + + def get_product(product_id): + return dph_service.get_data_product(data_product_id=product_id) + + with ThreadPoolExecutor(max_workers=5) as executor: + products = list(executor.map(get_product, product_ids)) + +Requirements +------------ + +- Python 3.8 or higher +- ibm-cloud-sdk-core >= 3.16.7 +- requests >= 2.32.4 +- python-dateutil >= 2.5.3 + +See Also +-------- + +- :ref:`dph_services_usage` - Detailed usage guide +- :ref:`dph_services_examples` - Code examples +- :ref:`api_dph_services` - API reference + +.. Made with Bob diff --git a/docs/chapters/05_dph_services/usage_guide.rst b/docs/chapters/05_dph_services/usage_guide.rst new file mode 100644 index 0000000..6e4fbc0 --- /dev/null +++ b/docs/chapters/05_dph_services/usage_guide.rst @@ -0,0 +1,613 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _dph_services_usage: + +Usage Guide +=========== + +This guide provides detailed instructions for using the Data Product Hub Services module. + +Installation +------------ + +Install the data-intelligence-sdk package: + +.. code-block:: bash + + pip install -e . + +Or install from PyPI (when available): + +.. code-block:: bash + + pip install data-intelligence-sdk + +Setup and Configuration +----------------------- + +Initialize the Service +~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + from wxdi.dph_services import DphV1 + from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + + # Create authenticator + authenticator = IAMAuthenticator('your-api-key') + + # Initialize service + dph_service = DphV1(authenticator=authenticator) + dph_service.set_service_url('https://your-dph-instance.com') + +Environment Variables +~~~~~~~~~~~~~~~~~~~~~ + +You can also configure the service using environment variables: + +.. code-block:: bash + + export DPH_APIKEY=your-api-key + export DPH_URL=https://your-dph-instance.com + +.. code-block:: python + + from wxdi.dph_services import DphV1 + + # Automatically uses environment variables + dph_service = DphV1.new_instance() + +Container Operations +-------------------- + +Initialize Container +~~~~~~~~~~~~~~~~~~~~ + +Initialize a new container with default settings: + +.. code-block:: python + + response = dph_service.initialize( + include=[ + 'delivery_methods', + 'data_product_samples', + 'domains_multi_industry' + ] + ) + + print(f"Container ID: {response.result['container']['id']}") + +Check Initialization Status +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + status = dph_service.get_initialize_status() + + if status.result['status'] == 'SUCCEEDED': + print("Container is ready") + elif status.result['status'] == 'IN_PROGRESS': + print("Initialization in progress") + else: + print(f"Status: {status.result['status']}") + +Get Service Credentials +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + credentials = dph_service.get_service_id_credentials() + print(f"Service ID: {credentials.result['service_id']}") + +Data Product Management +----------------------- + +Create a Data Product +~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + data_product = dph_service.create_data_product( + drafts=[{ + 'version': '1.0.0', + 'name': 'Customer Analytics Dataset', + 'description': 'Comprehensive customer behavior analytics', + 'asset': { + 'id': 'asset-123', + 'container': {'id': 'container-456'} + }, + 'domain': { + 'id': 'domain-789', + 'name': 'Customer Analytics' + }, + 'parts_out': [{ + 'asset': { + 'id': 'asset-123', + 'container': {'id': 'container-456'} + }, + 'delivery_methods': [{ + 'id': 'method-001', + 'container': {'id': 'container-456'} + }] + }] + }] + ) + + product_id = data_product.result['id'] + print(f"Created data product: {product_id}") + +List Data Products +~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # List with pagination + response = dph_service.list_data_products(limit=50) + + for product in response.result['data_products']: + print(f"- {product['name']} (v{product['version']})") + + # Use pager for all results + all_products = [] + pager = dph_service.list_data_products_with_pager(limit=50) + + for page in pager: + all_products.extend(page['data_products']) + +Get Data Product Details +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + product = dph_service.get_data_product(data_product_id=product_id) + + print(f"Name: {product.result['name']}") + print(f"Version: {product.result['version']}") + print(f"Description: {product.result['description']}") + print(f"Status: {product.result['state']}") + +Update Data Product +~~~~~~~~~~~~~~~~~~~ + +Use JSON Patch operations for updates: + +.. code-block:: python + + updated = dph_service.update_data_product( + data_product_id=product_id, + json_patch_instructions=[ + { + 'op': 'replace', + 'path': '/description', + 'value': 'Updated comprehensive customer analytics' + }, + { + 'op': 'add', + 'path': '/tags/-', + 'value': 'analytics' + } + ] + ) + +Delete Data Product +~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + dph_service.delete_data_product(data_product_id=product_id) + print("Data product deleted") + +Draft Management +---------------- + +Create a Draft +~~~~~~~~~~~~~~ + +.. code-block:: python + + draft = dph_service.create_data_product_draft( + data_product_id=product_id, + asset={'id': 'asset-123', 'container': {'id': 'container-456'}}, + version='1.1.0', + name='Customer Analytics Dataset v1.1', + description='Enhanced version with additional metrics' + ) + + draft_id = draft.result['id'] + +List Drafts +~~~~~~~~~~~ + +.. code-block:: python + + drafts = dph_service.list_data_product_drafts( + data_product_id=product_id, + limit=50 + ) + + for draft in drafts.result['drafts']: + print(f"- Draft {draft['version']}: {draft['state']}") + +Update Draft +~~~~~~~~~~~~ + +.. code-block:: python + + updated_draft = dph_service.update_data_product_draft( + data_product_id=product_id, + draft_id=draft_id, + json_patch_instructions=[ + { + 'op': 'replace', + 'path': '/description', + 'value': 'Updated draft description' + } + ] + ) + +Publish Draft +~~~~~~~~~~~~~ + +.. code-block:: python + + # Publish draft to create a release + release = dph_service.publish_data_product_draft( + data_product_id=product_id, + draft_id=draft_id + ) + + print(f"Published release: {release.result['id']}") + +Delete Draft +~~~~~~~~~~~~ + +.. code-block:: python + + dph_service.delete_data_product_draft( + data_product_id=product_id, + draft_id=draft_id + ) + +Contract Terms Management +------------------------- + +Create Contract Terms Document +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + doc = dph_service.create_draft_contract_terms_document( + data_product_id=product_id, + draft_id=draft_id, + contract_terms_id=terms_id, + type='terms_and_conditions', + name='Terms and Conditions', + url='https://example.com/terms.pdf', + attachment={ + 'id': 'attachment-123' + } + ) + +Get Contract Terms +~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + terms = dph_service.get_data_product_draft_contract_terms( + data_product_id=product_id, + draft_id=draft_id + ) + + for doc in terms.result['documents']: + print(f"- {doc['name']}: {doc['type']}") + +Update Contract Terms Document +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + updated_doc = dph_service.update_draft_contract_terms_document( + data_product_id=product_id, + draft_id=draft_id, + contract_terms_id=terms_id, + document_id=doc_id, + json_patch_instructions=[ + { + 'op': 'replace', + 'path': '/url', + 'value': 'https://example.com/updated-terms.pdf' + } + ] + ) + +Delete Contract Terms Document +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + dph_service.delete_draft_contract_terms_document( + data_product_id=product_id, + draft_id=draft_id, + contract_terms_id=terms_id, + document_id=doc_id + ) + +Release Management +------------------ + +List Releases +~~~~~~~~~~~~~ + +.. code-block:: python + + releases = dph_service.list_data_product_releases( + data_product_id=product_id, + state=['available', 'retired'] + ) + + for release in releases.result['releases']: + print(f"- v{release['version']}: {release['state']}") + +Get Release Details +~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + release = dph_service.get_data_product_release( + data_product_id=product_id, + release_id=release_id + ) + + print(f"Version: {release.result['version']}") + print(f"State: {release.result['state']}") + print(f"Published: {release.result['created_at']}") + +Update Release +~~~~~~~~~~~~~~ + +.. code-block:: python + + updated_release = dph_service.update_data_product_release( + data_product_id=product_id, + release_id=release_id, + json_patch_instructions=[ + { + 'op': 'replace', + 'path': '/description', + 'value': 'Updated release description' + } + ] + ) + +Retire Release +~~~~~~~~~~~~~~ + +.. 
code-block:: python + + retired = dph_service.retire_data_product_release( + data_product_id=product_id, + release_id=release_id + ) + + print(f"Release retired: {retired.result['state']}") + +Domain Management +----------------- + +List Domains +~~~~~~~~~~~~ + +.. code-block:: python + + domains = dph_service.list_data_product_domains(limit=50) + + for domain in domains.result['domains']: + print(f"- {domain['name']}: {domain['description']}") + +Create Domain +~~~~~~~~~~~~~ + +.. code-block:: python + + domain = dph_service.create_data_product_domain( + name='Customer Analytics', + description='Customer-related data products and analytics', + container={'id': 'container-123'} + ) + + domain_id = domain.result['id'] + +Create Subdomain +~~~~~~~~~~~~~~~~ + +.. code-block:: python + + subdomain = dph_service.create_data_product_subdomain( + domain_id=domain_id, + name='Customer Segmentation', + description='Customer segmentation and clustering datasets' + ) + +Get Domain +~~~~~~~~~~ + +.. code-block:: python + + domain = dph_service.get_domain(domain_id=domain_id) + + print(f"Name: {domain.result['name']}") + print(f"Subdomains: {len(domain.result.get('subdomains', []))}") + +Update Domain +~~~~~~~~~~~~~ + +.. code-block:: python + + updated_domain = dph_service.update_data_product_domain( + domain_id=domain_id, + json_patch_instructions=[ + { + 'op': 'replace', + 'path': '/description', + 'value': 'Updated domain description' + } + ] + ) + +Delete Domain +~~~~~~~~~~~~~ + +.. code-block:: python + + dph_service.delete_domain(domain_id=domain_id) + +Asset Visualization +------------------- + +Create Visualization +~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + visualization = dph_service.create_data_asset_visualization( + container={'id': 'container-123'}, + assets=[ + {'id': 'asset-1', 'container': {'id': 'container-123'}}, + {'id': 'asset-2', 'container': {'id': 'container-123'}}, + {'id': 'asset-3', 'container': {'id': 'container-123'}} + ] + ) + + print(f"Visualization created: {visualization.result['id']}") + +Reinitiate Visualization +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + reinitiated = dph_service.reinitiate_data_asset_visualization( + container={'id': 'container-123'}, + assets=[ + {'id': 'asset-1', 'container': {'id': 'container-123'}}, + {'id': 'asset-4', 'container': {'id': 'container-123'}} + ] + ) + +Contract Templates +------------------ + +Create Template +~~~~~~~~~~~~~~~ + +.. code-block:: python + + template = dph_service.create_contract_template( + name='Standard Data Sharing Agreement', + description='Standard terms for data product sharing', + contract_terms_documents=[{ + 'type': 'terms_and_conditions', + 'name': 'Standard Terms', + 'url': 'https://example.com/standard-terms.pdf' + }] + ) + + template_id = template.result['id'] + +List Templates +~~~~~~~~~~~~~~ + +.. code-block:: python + + templates = dph_service.list_data_product_contract_template(limit=50) + + for template in templates.result['contract_templates']: + print(f"- {template['name']}") + +Get Template +~~~~~~~~~~~~ + +.. code-block:: python + + template = dph_service.get_contract_template( + contract_template_id=template_id + ) + +Update Template +~~~~~~~~~~~~~~~ + +.. code-block:: python + + updated_template = dph_service.update_data_product_contract_template( + contract_template_id=template_id, + json_patch_instructions=[ + { + 'op': 'replace', + 'path': '/description', + 'value': 'Updated template description' + } + ] + ) + +Delete Template +~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + dph_service.delete_data_product_contract_template( + contract_template_id=template_id + ) + +Advanced Topics +--------------- + +Custom Headers +~~~~~~~~~~~~~~ + +Add custom headers to requests: + +.. code-block:: python + + response = dph_service.get_data_product( + data_product_id=product_id, + headers={ + 'Custom-Header': 'value', + 'X-Request-ID': 'unique-id' + } + ) + +Timeout Configuration +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + dph_service.set_http_config({ + 'timeout': 60 # 60 seconds + }) + +Disable SSL Verification (Development Only) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + dph_service.set_disable_ssl_verification(True) + +See Also +-------- + +- :ref:`dph_services_examples` - Complete code examples +- :ref:`api_dph_services` - API reference +- :ref:`dph_services_overview` - Architecture overview + diff --git a/docs/chapters/06_odcs_generator/collibra_integration.rst b/docs/chapters/06_odcs_generator/collibra_integration.rst new file mode 100644 index 0000000..d5fe3fa --- /dev/null +++ b/docs/chapters/06_odcs_generator/collibra_integration.rst @@ -0,0 +1,113 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _odcs_generator_collibra: + +Collibra Integration +==================== + +Generate ODCS files from Collibra data catalog assets. 
+ +Overview +-------- + +The Collibra integration extracts metadata from Collibra assets and generates ODCS v3.1.0 compliant YAML files. + +Features +-------- + +- ✅ Automatic metadata extraction via REST API +- ✅ Column discovery through asset relations +- ✅ Data type mapping (logical and physical) +- ✅ Classification support via GraphQL API +- ✅ Tag integration at asset and column levels +- ✅ Custom attribute preservation + +Installation +------------ + +.. code-block:: bash + + pip install -e . + +Configuration +------------- + +Environment Variables +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + export COLLIBRA_URL="https://your-instance.collibra.com" + export COLLIBRA_USERNAME="your_username" + export COLLIBRA_PASSWORD="your_password" + +Required Permissions +~~~~~~~~~~~~~~~~~~~~ + +- Read access to assets +- Read access to attributes +- Read access to relations +- Access to GraphQL API +- Read access to tags + +Usage +----- + +Command Line +~~~~~~~~~~~~ + +.. code-block:: bash + + python -m wxdi.odcs_generator.generate_odcs_from_collibra + +With options: + +.. code-block:: bash + + python -m wxdi.odcs_generator.generate_odcs_from_collibra \ + --output my-contract.yaml \ + --url https://collibra.com \ + --username myuser \ + --password mypass + +Python API +~~~~~~~~~~ + +.. 
code-block:: python + + from wxdi.odcs_generator.generate_odcs_from_collibra import CollibraClient, ODCSGenerator + + # Initialize client + client = CollibraClient( + base_url="https://your-instance.collibra.com", + username="your_username", + password="your_password" + ) + + # Create generator + generator = ODCSGenerator(client) + + # Generate ODCS + odcs_data = generator.generate_odcs("asset-id") + + # Save to file + generator.save_to_yaml(odcs_data, "output.yaml") + +See Also +-------- + +- :ref:`odcs_generator_examples` - Complete examples +- :ref:`api_odcs_generator` - API reference diff --git a/docs/chapters/06_odcs_generator/examples.rst b/docs/chapters/06_odcs_generator/examples.rst new file mode 100644 index 0000000..37c6f58 --- /dev/null +++ b/docs/chapters/06_odcs_generator/examples.rst @@ -0,0 +1,98 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _odcs_generator_examples: + +Examples +======== + +Complete examples for ODCS Generator module. + +Collibra Example +---------------- + +.. 
code-block:: python + + from wxdi.odcs_generator.generate_odcs_from_collibra import CollibraClient, ODCSGenerator + + # Initialize client + client = CollibraClient( + base_url="https://your-instance.collibra.com", + username="your_username", + password="your_password" + ) + + # Create generator + generator = ODCSGenerator(client) + + # Generate ODCS + odcs_data = generator.generate_odcs("019a57f9-62d2-7aa0-9f22-4fa2cea1180b") + + # Customize + odcs_data['dataProduct'] = 'Customer Data Product' + odcs_data['version'] = '2.0.0' + + # Save to file + generator.save_to_yaml(odcs_data, "customer-data-odcs.yaml") + +Informatica Example +------------------- + +.. code-block:: python + + from wxdi.odcs_generator.generate_odcs_from_informatica import InformaticaClient, ODCSGenerator + + # Initialize client + client = InformaticaClient( + base_url="https://your-informatica-instance.com", + username="your_username", + password="your_password" + ) + + # Create generator + generator = ODCSGenerator(client) + + # Generate ODCS + odcs_data = generator.generate_odcs("asset-id-123") + + # Save to file + generator.save_to_yaml(odcs_data, "output.yaml") + +Batch Processing +---------------- + +.. 
code-block:: python + + from wxdi.odcs_generator.generate_odcs_from_collibra import CollibraClient, ODCSGenerator + + client = CollibraClient(base_url, username, password) + generator = ODCSGenerator(client) + + asset_ids = ['id1', 'id2', 'id3'] + + for asset_id in asset_ids: + try: + odcs_data = generator.generate_odcs(asset_id) + generator.save_to_yaml(odcs_data, f"{asset_id}-odcs.yaml") + print(f"✅ Generated ODCS for {asset_id}") + except Exception as e: + print(f"❌ Failed for {asset_id}: {e}") + +See Also +-------- + +- :ref:`odcs_generator_collibra` - Collibra integration +- :ref:`odcs_generator_informatica` - Informatica integration +- :ref:`api_odcs_generator` - API reference diff --git a/docs/chapters/06_odcs_generator/index.rst b/docs/chapters/06_odcs_generator/index.rst new file mode 100644 index 0000000..31b1ce5 --- /dev/null +++ b/docs/chapters/06_odcs_generator/index.rst @@ -0,0 +1,189 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _odcs_generator: + +ODCS Generator +============== + +Tools to automatically generate ODCS (Open Data Contract Standard) v3.1.0 compliant YAML files from data catalog metadata. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + overview + collibra_integration + informatica_integration + examples + +Overview +-------- + +The ``odcs_generator`` module provides automated generation of Open Data Contract Standard (ODCS) files from enterprise data catalogs. 
It extracts metadata from catalog systems and transforms it into standardized data contracts. + +Key Features +------------ + +**Multi-Catalog Support** + Generate ODCS files from Collibra and Informatica CDGC data catalogs. + +**Automatic Metadata Extraction** + Fetch asset details, attributes, relations, and classifications automatically. + +**Column Discovery** + Automatically discover and document table columns through catalog relations. + +**Data Type Mapping** + Intelligent mapping of catalog data types to ODCS standard types. + +**Classification Support** + Extract and include data classifications and sensitivity labels. + +**ODCS v3.1.0 Compliance** + Generate fully compliant ODCS YAML files ready for use. + +Quick Start +----------- + +**Collibra Integration** + +.. code-block:: python + + from wxdi.odcs_generator.generate_odcs_from_collibra import CollibraClient, ODCSGenerator + + # Initialize client + client = CollibraClient( + base_url="https://your-instance.collibra.com", + username="your_username", + password="your_password" + ) + + # Create generator + generator = ODCSGenerator(client) + + # Generate ODCS + odcs_data = generator.generate_odcs("asset-id") + + # Save to file + generator.save_to_yaml(odcs_data, "output.yaml") + +**Informatica Integration** + +.. code-block:: python + + from wxdi.odcs_generator.generate_odcs_from_informatica import InformaticaClient, ODCSGenerator + + # Initialize client + client = InformaticaClient( + base_url="https://your-informatica-instance.com", + username="your_username", + password="your_password" + ) + + # Create generator + generator = ODCSGenerator(client) + + # Generate ODCS + odcs_data = generator.generate_odcs("asset-id") + +Use Cases +--------- + +**Data Contract Automation** + Automatically generate data contracts from existing catalog metadata. + +**Catalog Migration** + Export catalog metadata to standardized ODCS format for migration. 
+ +**Documentation Generation** + Create comprehensive data documentation from catalog assets. + +**Compliance Reporting** + Generate standardized contracts for compliance and governance. + +**Data Product Onboarding** + Accelerate data product creation with automated contract generation. + +Supported Catalogs +------------------ + +**Collibra** + - Asset metadata extraction + - Column discovery via relations + - Data classifications via GraphQL + - Tag integration + - Custom attributes + +**Informatica CDGC** + - Asset metadata extraction + - Column schema discovery + - System attributes + - Technical metadata + - Business glossary terms + +What is ODCS? +------------- + +The Open Data Contract Standard (ODCS) is an open-source specification for defining data contracts. It provides: + +- **Standardized Format**: Common structure for data contracts across organizations +- **Schema Definition**: Detailed column-level metadata and constraints +- **Quality Rules**: Data quality expectations and validation rules +- **Service Level Agreements**: Performance and availability commitments +- **Governance**: Data ownership, stewardship, and compliance information + +ODCS v3.1.0 Structure +~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: yaml + + id: unique-contract-id + kind: DataContract + apiVersion: v3.1.0 + domain: domain-name + dataProduct: product-name + version: 1.0.0 + name: contract-name + status: active + description: + authoritativeDefinitions: + - type: source-system + url: source-url + schema: + - id: table-id + name: table-name + columns: + - id: column-id + name: column-name + logicalType: string + physicalType: VARCHAR(255) + description: column description + classification: PII + quality: + - id: rule-id + name: rule-name + type: completeness + column: column-name + +Next Steps +---------- + +- :ref:`odcs_generator_collibra` - Collibra integration guide +- :ref:`odcs_generator_informatica` - Informatica integration guide +- :ref:`odcs_generator_examples` - Complete code examples +- :ref:`api_odcs_generator` - API reference + +.. Made with Bob diff --git a/docs/chapters/06_odcs_generator/informatica_integration.rst b/docs/chapters/06_odcs_generator/informatica_integration.rst new file mode 100644 index 0000000..735691e --- /dev/null +++ b/docs/chapters/06_odcs_generator/informatica_integration.rst @@ -0,0 +1,93 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _odcs_generator_informatica: + +Informatica Integration +======================== + +Generate ODCS files from Informatica CDGC (Cloud Data Governance and Catalog) assets. 
+ +Overview +-------- + +The Informatica integration extracts metadata from Informatica CDGC and generates ODCS v3.1.0 compliant YAML files. + +Features +-------- + +- ✅ Asset metadata extraction via REST API +- ✅ Column schema discovery +- ✅ System attribute handling +- ✅ Technical metadata extraction +- ✅ Business glossary term integration + +Installation +------------ + +.. code-block:: bash + + pip install -e . + +Configuration +------------- + +Environment Variables +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + export INFORMATICA_URL="https://your-informatica-instance.com" + export INFORMATICA_USERNAME="your_username" + export INFORMATICA_PASSWORD="your_password" + +Usage +----- + +Command Line +~~~~~~~~~~~~ + +.. code-block:: bash + + python -m wxdi.odcs_generator.generate_odcs_from_informatica + +Python API +~~~~~~~~~~ + +.. code-block:: python + + from wxdi.odcs_generator.generate_odcs_from_informatica import InformaticaClient, ODCSGenerator + + # Initialize client + client = InformaticaClient( + base_url="https://your-informatica-instance.com", + username="your_username", + password="your_password" + ) + + # Create generator + generator = ODCSGenerator(client) + + # Generate ODCS + odcs_data = generator.generate_odcs("asset-id") + + # Save to file + generator.save_to_yaml(odcs_data, "output.yaml") + +See Also +-------- + +- :ref:`odcs_generator_examples` - Complete examples +- :ref:`api_odcs_generator` - API reference diff --git a/docs/chapters/06_odcs_generator/overview.rst b/docs/chapters/06_odcs_generator/overview.rst new file mode 100644 index 0000000..e75c68d --- /dev/null +++ b/docs/chapters/06_odcs_generator/overview.rst @@ -0,0 +1,345 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _odcs_generator_overview: + +ODCS Generator Overview +======================= + +The ODCS Generator module automates the creation of Open Data Contract Standard (ODCS) v3.1.0 compliant YAML files from enterprise data catalog metadata. + +Architecture +------------ + +The module uses a modular architecture with catalog-specific clients and a common generator: + +.. code-block:: text + + ┌─────────────────────────────────────────┐ + │ ODCS Generator Module │ + ├─────────────────────────────────────────┤ + │ │ + │ ┌──────────────┐ ┌───────────────┐ │ + │ │ Collibra │ │ Informatica │ │ + │ │ Client │ │ Client │ │ + │ └──────┬───────┘ └───────┬───────┘ │ + │ │ │ │ + │ └───────┬───────────┘ │ + │ │ │ + │ ┌───────▼────────┐ │ + │ │ ODCS Generator │ │ + │ └───────┬────────┘ │ + │ │ │ + │ ┌───────▼────────┐ │ + │ │ YAML Output │ │ + │ └────────────────┘ │ + └─────────────────────────────────────────┘ + +Core Components +--------------- + +Catalog Clients +~~~~~~~~~~~~~~~ + +**CollibraClient** + - REST API integration + - GraphQL API for classifications + - Asset, attribute, and relation extraction + - Tag and classification support + +**InformaticaClient** + - REST API integration + - Asset metadata extraction + - Column schema discovery + - System attribute handling + +ODCS Generator +~~~~~~~~~~~~~~ + +The generator transforms catalog metadata into ODCS format: + +1. **Metadata Extraction**: Fetch asset details from catalog +2. **Column Discovery**: Identify and extract column information +3. **Type Mapping**: Convert catalog types to ODCS types +4. 
**Classification Mapping**: Extract data classifications +5. **YAML Generation**: Create compliant ODCS YAML file + +Data Type Mapping +----------------- + +Logical Type Mapping +~~~~~~~~~~~~~~~~~~~~ + +Catalog types are mapped to ODCS logical types: + +.. list-table:: + :header-rows: 1 + :widths: 30 30 40 + + * - Catalog Type + - ODCS Logical Type + - Description + * - text, string, varchar + - string + - Text data + * - whole number, int, integer + - integer + - Whole numbers + * - decimal number, float, double + - number + - Decimal numbers + * - date time, timestamp + - timestamp + - Date and time + * - true/false, boolean + - boolean + - Boolean values + * - geographical, geo + - string + - Geographic data + +Physical Type Mapping +~~~~~~~~~~~~~~~~~~~~~ + +Physical types preserve database-specific information: + +- ``VARCHAR(255)`` - Variable character with length +- ``DECIMAL(10,2)`` - Decimal with precision and scale +- ``NUMBER(18,4)`` - Numeric with precision and scale +- ``TIMESTAMP(6)`` - Timestamp with precision + +Classification Support +---------------------- + +The generator extracts and maps data classifications: + +**Collibra Classifications** + - Extracted via GraphQL API + - Mapped to ODCS classification field + - Supports custom classification schemes + +**Informatica Classifications** + - Extracted from asset attributes + - Mapped to ODCS tags and classifications + - Supports data sensitivity labels + +Common Classifications +~~~~~~~~~~~~~~~~~~~~~~ + +- **PII** - Personally Identifiable Information +- **PHI** - Protected Health Information +- **Confidential** - Confidential business data +- **Public** - Publicly available data +- **Internal** - Internal use only + +ODCS Structure +-------------- + +Generated ODCS files follow this structure: + +Contract Metadata +~~~~~~~~~~~~~~~~~ + +.. 
code-block:: yaml + + id: unique-contract-id + kind: DataContract + apiVersion: v3.1.0 + domain: domain-name + dataProduct: product-name + version: 1.0.0 + name: contract-name + status: active + contractCreatedTs: 2026-04-16T06:00:00Z + +Description Section +~~~~~~~~~~~~~~~~~~~ + +.. code-block:: yaml + + description: + purpose: Purpose of the data + authoritativeDefinitions: + - type: collibra-asset + url: https://collibra.com/asset/123 + limitations: Usage limitations + usage: Intended usage + +Schema Section +~~~~~~~~~~~~~~ + +.. code-block:: yaml + + schema: + - id: table-id + name: table_name + physicalName: PHYSICAL_TABLE_NAME + physicalType: table + description: Table description + tags: + - customer-data + - analytics + columns: + - id: column-id + name: column_name + logicalType: string + physicalType: VARCHAR(255) + description: Column description + isNullable: false + isPrimaryKey: false + classification: PII + tags: + - sensitive + +Quality Section +~~~~~~~~~~~~~~~ + +.. code-block:: yaml + + quality: + - id: rule-001 + name: completeness-check + type: completeness + column: customer_id + dimension: completeness + threshold: 0.95 + +Service Level Agreement +~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: yaml + + sla: + interval: daily + uptime: 99.9% + responseTime: 100ms + +Best Practices +-------------- + +**Validate Catalog Connectivity** + +.. code-block:: python + + try: + client = CollibraClient(base_url, username, password) + # Test connection + asset = client.get_asset("test-id") + except Exception as e: + print(f"Connection failed: {e}") + +**Handle Missing Metadata** + +.. code-block:: python + + # Provide defaults for missing fields + odcs_data = generator.generate_odcs( + asset_id, + defaults={ + 'dataProduct': 'Default Product', + 'version': '1.0.0', + 'status': 'draft' + } + ) + +**Batch Processing** + +.. 
code-block:: python + + asset_ids = ['id1', 'id2', 'id3'] + + for asset_id in asset_ids: + try: + odcs_data = generator.generate_odcs(asset_id) + generator.save_to_yaml(odcs_data, f"{asset_id}-odcs.yaml") + except Exception as e: + print(f"Failed for {asset_id}: {e}") + +**Customize Output** + +.. code-block:: python + + # Generate ODCS + odcs_data = generator.generate_odcs(asset_id) + + # Customize before saving + odcs_data['dataProduct'] = 'My Data Product' + odcs_data['version'] = '2.0.0' + odcs_data['quality'] = [ + { + 'id': 'custom-rule', + 'name': 'Custom Quality Rule', + 'type': 'accuracy' + } + ] + + # Save customized ODCS + generator.save_to_yaml(odcs_data, 'custom-odcs.yaml') + +Error Handling +-------------- + +Common errors and solutions: + +**Authentication Errors** + +.. code-block:: python + + from requests.exceptions import HTTPError + + try: + client = CollibraClient(url, username, password) + except HTTPError as e: + if e.response.status_code == 401: + print("Invalid credentials") + elif e.response.status_code == 403: + print("Insufficient permissions") + +**Asset Not Found** + +.. code-block:: python + + try: + odcs_data = generator.generate_odcs(asset_id) + except ValueError as e: + print(f"Asset not found: {e}") + +**Missing Columns** + +.. code-block:: python + + odcs_data = generator.generate_odcs(asset_id) + + if not (odcs_data.get('schema') or [{}])[0].get('columns'): + print("Warning: No columns found for asset") + +Requirements +------------ + +- Python 3.8 or higher +- requests >= 2.32.4 +- pyyaml >= 5.4.0 +- urllib3 >= 2.6.3 +- python-dateutil >= 2.5.3 + +See Also +-------- + +- :ref:`odcs_generator_collibra` - Collibra integration +- :ref:`odcs_generator_informatica` - Informatica integration +- :ref:`odcs_generator_examples` - Code examples +- :ref:`api_odcs_generator` - API reference + +.. 
Made with Bob diff --git a/docs/chapters/07_data_product_recommender/examples.rst b/docs/chapters/07_data_product_recommender/examples.rst new file mode 100644 index 0000000..ae39ed9 --- /dev/null +++ b/docs/chapters/07_data_product_recommender/examples.rst @@ -0,0 +1,100 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _data_product_recommender_examples: + +Examples +======== + +Basic Example +------------- + +.. code-block:: python + + from wxdi.data_product_recommender.platforms import SnowflakeQueryParser + from wxdi.data_product_recommender.recommender import DataProductRecommender + + # Initialize with Snowflake parser + parser = SnowflakeQueryParser() + recommender = DataProductRecommender(parser) + + # Load and analyze query logs + recommender.load_query_logs_from_csv_file('query_logs.csv') + recommender.calculate_metrics() + + # Get top 20 recommendations + recommendations = recommender.recommend_data_products(num_recommendations=20) + + # Export to Markdown + recommender.export_recommendations_markdown(recommendations, 'output/recommendations.md') + + # Export to JSON + recommender.export_recommendations_json(recommendations, 'output/recommendations.json') + +Multi-Platform Example +---------------------- + +.. 
code-block:: python + + from wxdi.data_product_recommender.platforms import ( + SnowflakeQueryParser, + DatabricksQueryParser, + BigQueryQueryParser + ) + from wxdi.data_product_recommender.recommender import DataProductRecommender + + # Snowflake + snowflake_parser = SnowflakeQueryParser() + snowflake_recommender = DataProductRecommender(snowflake_parser) + snowflake_recommender.load_query_logs_from_csv_file('snowflake_logs.csv') + + # Databricks + databricks_parser = DatabricksQueryParser() + databricks_recommender = DataProductRecommender(databricks_parser) + databricks_recommender.load_query_logs_from_csv_file('databricks_logs.csv') + + # BigQuery + bigquery_parser = BigQueryQueryParser() + bigquery_recommender = DataProductRecommender(bigquery_parser) + bigquery_recommender.load_query_logs_from_csv_file('bigquery_logs.csv') + +Custom Scoring Weights +---------------------- + +.. code-block:: python + + from wxdi.data_product_recommender.platforms import SnowflakeQueryParser + from wxdi.data_product_recommender.recommender import DataProductRecommender + + parser = SnowflakeQueryParser() + recommender = DataProductRecommender(parser) + + # Customize scoring weights + recommender.weights = { + 'query_count': 0.5, # Emphasize query volume + 'user_diversity': 0.3, # Moderate user diversity + 'recency': 0.1, # Less emphasis on recency + 'consistency': 0.1 # Less emphasis on consistency + } + + recommender.load_query_logs_from_csv_file('query_logs.csv') + recommender.calculate_metrics() + recommendations = recommender.recommend_data_products(num_recommendations=20) + +See Also +-------- + +- :ref:`data_product_recommender_usage` - Usage guide +- :ref:`api_data_product_recommender` - API reference diff --git a/docs/chapters/07_data_product_recommender/index.rst b/docs/chapters/07_data_product_recommender/index.rst new file mode 100644 index 0000000..44eb21f --- /dev/null +++ b/docs/chapters/07_data_product_recommender/index.rst @@ -0,0 +1,111 @@ +.. 
+ Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _data_product_recommender: + +Data Product Recommender +========================= + +Analyze database query logs to identify high-value tables and logical groupings for data product prioritization. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + overview + usage_guide + examples + +Overview +-------- + +The ``data_product_recommender`` module analyzes query log files to identify which tables should be prioritized as data products in a data marketplace. + +Key Features +------------ + +**Multi-Platform Support** + Supports Snowflake, Databricks, BigQuery, and watsonx.data query log formats. + +**File-Based Input** + Works with CSV and JSON query log files (no direct database connection required). + +**Intelligent Scoring** + Combines query frequency, user diversity, recency, and consistency metrics. + +**Table Grouping** + Identifies tables frequently used together for logical data product groupings. + +**Multiple Output Formats** + Generates both Markdown (human-readable) and JSON (agent-consumable) reports. + +**CLI and Python API** + Use from command line or integrate into applications. + +Quick Start +----------- + +Command Line +~~~~~~~~~~~~ + +.. code-block:: bash + + python -m wxdi.data_product_recommender.cli \ + --platform snowflake \ + --input-file query_logs.csv \ + --output output \ + --num-recommendations 20 + +Python API +~~~~~~~~~~ + +.. 
code-block:: python + + from wxdi.data_product_recommender.platforms import SnowflakeQueryParser + from wxdi.data_product_recommender.recommender import DataProductRecommender + + # Initialize + parser = SnowflakeQueryParser() + recommender = DataProductRecommender(parser) + + # Load and analyze + recommender.load_query_logs_from_csv_file('query_logs.csv') + recommender.calculate_metrics() + recommendations = recommender.recommend_data_products(num_recommendations=20) + + # Export + recommender.export_recommendations_markdown(recommendations, 'output/recommendations.md') + +Use Cases +--------- + +**Accelerate Data Product Onboarding** + Leverage existing usage patterns rather than starting from scratch. + +**Identify High-Value Assets** + Find tables with demonstrated business value through real usage. + +**Discover Logical Groupings** + Identify tables commonly used together for cohesive data products. + +**Prioritize Catalog Promotion** + Focus efforts on tables with highest user demand and diversity. + +Next Steps +---------- + +- :ref:`data_product_recommender_usage` - Detailed usage guide +- :ref:`data_product_recommender_examples` - Code examples +- :ref:`api_data_product_recommender` - API reference diff --git a/docs/chapters/07_data_product_recommender/overview.rst b/docs/chapters/07_data_product_recommender/overview.rst new file mode 100644 index 0000000..7b46a50 --- /dev/null +++ b/docs/chapters/07_data_product_recommender/overview.rst @@ -0,0 +1,65 @@ +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + +.. _data_product_recommender_overview: + +Overview +======== + +The Data Product Recommender analyzes query logs to identify high-value tables for data product creation. + +Scoring Methodology +------------------- + +Individual Table Scoring (0-100) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- **37.5%** Query Count - Volume of usage +- **37.5%** User Diversity - Breadth of usage across teams +- **15%** Recency - Recent activity +- **10%** Consistency - Regular usage patterns + +Table Group Scoring (0-100) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- **30%** Cohesion - How tightly tables are connected +- **20%** Usage - Relative usage compared to other groups +- **15%** User Reach - Percentage of users querying the group +- **20%** Recency - Recent activity across tables +- **10%** Consistency - Regular usage patterns +- **5%** Size - Number of tables in the group + +Star Rating Scale +~~~~~~~~~~~~~~~~~ + +- ⭐⭐⭐⭐⭐ **Excellent (80-100)**: Implement immediately +- ⭐⭐⭐⭐ **Good (60-79)**: Medium priority +- ⭐⭐⭐ **Fair (40-59)**: Consider splitting or implement later +- ⭐⭐ **Weak (20-39)**: Reconsider grouping +- ⭐ **Poor (0-19)**: Do not implement + +Platform Support +---------------- + +- ✅ **Snowflake** - Export from SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY +- ✅ **Databricks** - Export from system.query.history +- ✅ **BigQuery** - Export from INFORMATION_SCHEMA.JOBS_BY_PROJECT +- ✅ **watsonx.data** - Export from system.runtime.queries + +See Also +-------- + +- :ref:`data_product_recommender_usage` - Usage guide +- :ref:`data_product_recommender_examples` - Examples diff --git a/docs/chapters/07_data_product_recommender/usage_guide.rst b/docs/chapters/07_data_product_recommender/usage_guide.rst new file mode 100644 index 0000000..e638e48 --- /dev/null +++ b/docs/chapters/07_data_product_recommender/usage_guide.rst @@ -0,0 +1,82 @@ +.. 
+ Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _data_product_recommender_usage: + +Usage Guide +=========== + +Installation +------------ + +.. code-block:: bash + + pip install -e . + +CLI Usage +--------- + +.. code-block:: bash + + python -m wxdi.data_product_recommender.cli \ + --platform snowflake \ + --input-file query_logs.csv \ + --output output \ + --num-recommendations 20 \ + --min-score 60.0 + +Options +~~~~~~~ + +- ``--platform`` - Database platform (snowflake, databricks, bigquery, watsonxdata) +- ``--input-file`` - Path to CSV or JSON query log file +- ``--output`` - Output directory (default: output) +- ``--output-format`` - Output format: markdown or json (default: markdown) +- ``--num-recommendations`` - Number of recommendations (default: 20) +- ``--min-score`` - Minimum score threshold 0-100 + +Python API +---------- + +.. 
code-block:: python + + from wxdi.data_product_recommender.platforms import SnowflakeQueryParser + from wxdi.data_product_recommender.recommender import DataProductRecommender + + # Initialize + parser = SnowflakeQueryParser() + recommender = DataProductRecommender(parser) + + # Load query logs + recommender.load_query_logs_from_csv_file('query_logs.csv') + + # Calculate metrics + recommender.calculate_metrics() + + # Get recommendations + recommendations = recommender.recommend_data_products( + num_recommendations=20, + min_score=60.0 + ) + + # Export results + recommender.export_recommendations_markdown(recommendations, 'output/recommendations.md') + recommender.export_recommendations_json(recommendations, 'output/recommendations.json') + +See Also +-------- + +- :ref:`data_product_recommender_examples` - Complete examples +- :ref:`api_data_product_recommender` - API reference diff --git a/docs/chapters/05_future_modules/index.rst b/docs/chapters/08_future_modules/index.rst similarity index 96% rename from docs/chapters/05_future_modules/index.rst rename to docs/chapters/08_future_modules/index.rst index 9eb46fb..38136f0 100644 --- a/docs/chapters/05_future_modules/index.rst +++ b/docs/chapters/08_future_modules/index.rst @@ -1,108 +1,108 @@ -.. - Copyright 2026 IBM Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - -.. 
_future_modules: - -Future Modules -============== - -The ``IBM watsonx.data intelligence SDK`` is designed with a modular architecture that allows different teams to contribute specialized functionality while sharing common components like authentication. - -Architecture for Extensibility -------------------------------- - -The SDK's modular design enables: - -* **Independent Development**: Teams can develop modules independently -* **Shared Infrastructure**: All modules use common authentication and configuration -* **Consistent API**: Modules follow the same design patterns -* **Easy Integration**: New modules integrate seamlessly with existing ones - -Adding New Modules ------------------- - -Teams adding new modules should: - -1. **Use Common Authentication**: Leverage the ``common.auth`` module for authentication -2. **Follow Naming Conventions**: Use clear, descriptive module names -3. **Provide Documentation**: Include comprehensive documentation following this structure -4. **Include Examples**: Provide working code examples -5. **Add Tests**: Include unit and integration tests - -Documentation Structure for New Modules ----------------------------------------- - -When adding a new module, create documentation following this pattern: - -.. code-block:: text - - docs/chapters/0X_module_name/ - ├── index.rst # Module overview - ├── core_concepts.rst # Key concepts - ├── usage.rst # Usage guide - ├── examples.rst # Code examples - └── api_reference.rst # API documentation - -API Reference Structure -~~~~~~~~~~~~~~~~~~~~~~~ - -Add API reference documentation: - -.. 
code-block:: text - - docs/api/module_name/ - ├── index.rst # API overview - ├── classes.rst # Main classes - └── utilities.rst # Utility functions - -Planned Modules ---------------- - -While specific modules are still being defined, potential areas include: - -* Data profiling and statistics -* Data lineage tracking -* Data catalog integration -* Additional data quality features -* Custom analytics capabilities - -Contact -------- - -If your team is planning to add a module to the SDK: - -* Review the existing module structure (``dq_validator``) -* Follow the authentication patterns in ``common.auth`` -* Coordinate with the SDK maintainers -* Submit documentation along with your module - -For questions or to propose a new module: - -* Email: Data_Intelligence_SDK@wwpdl.vnet.ibm.com -* GitHub: Open an issue or discussion - -Contributing ------------- - -See the CONTRIBUTING.md file in the repository for detailed guidelines on: - -* Code style and standards -* Testing requirements -* Documentation requirements -* Pull request process - -We look forward to growing the SDK with contributions from teams across IBM! - -.. Made with Bob +.. + Copyright 2026 IBM Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. _future_modules: + +Future Modules +============== + +The ``IBM watsonx.data intelligence SDK`` is designed with a modular architecture that allows different teams to contribute specialized functionality while sharing common components like authentication. 
+ +Architecture for Extensibility +------------------------------- + +The SDK's modular design enables: + +* **Independent Development**: Teams can develop modules independently +* **Shared Infrastructure**: All modules use common authentication and configuration +* **Consistent API**: Modules follow the same design patterns +* **Easy Integration**: New modules integrate seamlessly with existing ones + +Adding New Modules +------------------ + +Teams adding new modules should: + +1. **Use Common Authentication**: Leverage the ``common.auth`` module for authentication +2. **Follow Naming Conventions**: Use clear, descriptive module names +3. **Provide Documentation**: Include comprehensive documentation following this structure +4. **Include Examples**: Provide working code examples +5. **Add Tests**: Include unit and integration tests + +Documentation Structure for New Modules +---------------------------------------- + +When adding a new module, create documentation following this pattern: + +.. code-block:: text + + docs/chapters/0X_module_name/ + ├── index.rst # Module overview + ├── core_concepts.rst # Key concepts + ├── usage.rst # Usage guide + ├── examples.rst # Code examples + └── api_reference.rst # API documentation + +API Reference Structure +~~~~~~~~~~~~~~~~~~~~~~~ + +Add API reference documentation: + +.. 
code-block:: text + + docs/api/module_name/ + ├── index.rst # API overview + ├── classes.rst # Main classes + └── utilities.rst # Utility functions + +Planned Modules +--------------- + +While specific modules are still being defined, potential areas include: + +* Data profiling and statistics +* Data lineage tracking +* Data catalog integration +* Additional data quality features +* Custom analytics capabilities + +Contact +------- + +If your team is planning to add a module to the SDK: + +* Review the existing module structure (``dq_validator``) +* Follow the authentication patterns in ``common.auth`` +* Coordinate with the SDK maintainers +* Submit documentation along with your module + +For questions or to propose a new module: + +* Email: Data_Intelligence_SDK@wwpdl.vnet.ibm.com +* GitHub: Open an issue or discussion + +Contributing +------------ + +See the CONTRIBUTING.md file in the repository for detailed guidelines on: + +* Code style and standards +* Testing requirements +* Documentation requirements +* Pull request process + +We look forward to growing the SDK with contributions from teams across IBM! + +.. Made with Bob diff --git a/docs/index.rst b/docs/index.rst index 151d37b..428f467 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,12 +16,15 @@ IBM watsonx.data intelligence SDK for Python ============================================ -The ``IBM watsonx.data intelligence SDK`` for Python is a comprehensive toolkit for data intelligence operations, providing modular components for data quality validation, authentication, and more. +The ``IBM watsonx.data intelligence SDK`` for Python is a comprehensive toolkit for data intelligence operations, providing modular components for data quality validation, data product management, ODCS generation, and intelligent recommendations. This SDK is designed with a modular architecture, allowing different teams to contribute specialized functionality while sharing common components like authentication. 
Currently, the SDK includes: * **Common Modules**: Shared authentication and configuration for all SDK modules * **DQ Validator**: In-memory data quality validation for streaming data, Pandas DataFrames, and PySpark DataFrames +* **DPH Services**: Python client for IBM Data Product Hub API +* **ODCS Generator**: Generate Open Data Contract Standard files from data catalogs +* **Data Product Recommender**: Analyze query logs to identify high-value data products The ``IBM watsonx.data intelligence SDK`` is supported on Python 3.8+. @@ -31,6 +34,15 @@ Key Features **Data Quality Validation** Comprehensive validation framework with 9 check types, support for array-based records and DataFrames, and integration with IBM Cloud Pak for Data. +**Data Product Hub Integration** + Complete Python SDK for managing data products, drafts, releases, contract terms, and domains. + +**ODCS Generation** + Automated generation of ODCS v3.1.0 compliant YAML files from Collibra and Informatica catalogs. + +**Intelligent Recommendations** + Query log analysis to identify high-value tables and logical groupings for data product prioritization. + **Multi-Environment Authentication** Unified authentication supporting IBM Cloud, AWS Cloud, Government Cloud, and on-premises deployments. @@ -48,7 +60,10 @@ Key Features chapters/02_overview/index chapters/03_common_modules/index chapters/04_dq_validator/index - chapters/05_future_modules/index + chapters/05_dph_services/index + chapters/06_odcs_generator/index + chapters/07_data_product_recommender/index + chapters/08_future_modules/index api/index .. Made with Bob diff --git a/src/wxdi/dph_services/dph_v1.py b/src/wxdi/dph_services/dph_v1.py index a05cec9..d7908ff 100644 --- a/src/wxdi/dph_services/dph_v1.py +++ b/src/wxdi/dph_services/dph_v1.py @@ -1392,9 +1392,7 @@ def replace_data_product_draft_contract_terms( contract. :param List[ContractTemplateSLA] sla: (optional) Service Level Agreement details. 
- :param List[ContractTemplateSupportAndCommunication] - support_and_communication: (optional) Support and communication details for - the contract. + :param List[ContractTemplateSupportAndCommunication] support_and_communication: (optional) Support and communication details for the contract. :param List[ContractTemplateCustomProperty] custom_properties: (optional) Custom properties that are not part of the standard contract. :param ContractTest contract_test: (optional) Contains the contract test