diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..f2f205f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +# ignore .git and .cache folders +.git +.cache +.idea +data diff --git a/.gitignore b/.gitignore index e272905..bdca348 100644 --- a/.gitignore +++ b/.gitignore @@ -1,109 +1,110 @@ -*# -*~ -.idea/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -ve/ -venv/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# dotenv -.env - -# virtualenv -.venv -venv/ -ENV/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ - -benchmarks +*# +*~ +.idea/ +data/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +ve/ +venv/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# dotenv
+.env
+
+# virtualenv
+.venv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+benchmarks
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..fbca14b
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,38 @@
+# CC Specific Dockerfile implementing steps at https://github.com/LexPredict/openedgar/blob/master/INSTALL.md
+# Allows the use of OpenEDGAR in AKS
+FROM ubuntu:18.04
+MAINTAINER Michael Seddon (michael.seddon@cliffordchance.com)
+
+# Environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Package installation
+RUN apt update
+RUN apt upgrade -y
+RUN apt install -y software-properties-common build-essential python3-dev python3-pip virtualenv git-all
+# to be removed when rabbit is in its own container
+RUN apt install -y rabbitmq-server
+RUN apt-get install -y openjdk-8-jdk
+
+# Create the OpenEDGAR directory
+WORKDIR /opt
+RUN mkdir /opt/openedgar
+
+# Set up Python venv
+WORKDIR /opt/openedgar/
+RUN virtualenv -p /usr/bin/python3 env
+COPY lexpredict_openedgar/requirements/full.txt lexpredict_openedgar/requirements/full.txt
+RUN ./env/bin/pip install -r lexpredict_openedgar/requirements/full.txt
+RUN ./env/bin/pip install azure-mgmt-resource azure-mgmt-datalake-store azure-datalake-store azure-storage-blob
+COPY tika/tika-server-1.21.jar /opt/openedgar/tika/tika-server-1.21.jar
+COPY lexpredict_openedgar/ /opt/openedgar/lexpredict_openedgar/
+
+COPY docker/default.env /opt/openedgar/
+RUN cp lexpredict_openedgar/sample.env lexpredict_openedgar/.env
+#COPY docker/erlang-solutions_1.0_all.deb lexpredict_openedgar/erlang-solutions_1.0_all.deb
+COPY docker/oe-entrypoint.sh /usr/local/bin/
+COPY docker/run_edgar.py /opt/openedgar/lexpredict_openedgar/run_edgar.py
+COPY docker/dot_env.sh /opt/openedgar
+RUN mkdir /data
+
+ENTRYPOINT ["oe-entrypoint.sh"]
\ No newline at end of file
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000..989ec2e
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,38 @@
+# Dockerisation
+The image takes its default parameters from [default.env](docker/default.env).
+
+All the variables can be overridden at runtime as environment variables.
+
+## Download Tika
+Run the `download_tika.sh` script in `/tika`; it will download Tika server 1.21 into the `/tika`
+folder.
+
+## Docker
+Run the following from the repository root to build the image:
+
+    docker build -t dslcr.azurecr.io/openedgar:1.1 .
+
+## Run container
+It is wise to mount a local folder into the container so that you can access the
+downloaded documents.
+Example:
+
+    docker run --env-file vars.txt -v /Users/mirko/Projects/research-openedgar/data:/data dslcr.azurecr.io/openedgar:1.1
+
+Contents of `vars.txt`:
+
+    EDGAR_YEAR=2015
+    EDGAR_QUARTER=1
+    EDGAR_MONTH=1
+    CLIENT_TYPE=Local
+    S3_DOCUMENT_PATH=/data
+    DOWNLOAD_PATH=/data
+
+After the download has terminated, you have to stop the container:
+
+    $ docker ps
+
+    CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
+    9e0ae247b61f dslcr.azurecr.io/openedgar:1.1 "oe-entrypoint.sh" 2 minutes ago Up 2 minutes priceless_bardeen
+
+    $ docker kill 9e
diff --git a/docker/default.env b/docker/default.env
new file mode 100755
index 0000000..65f3a67
--- /dev/null
+++ b/docker/default.env
@@ -0,0 +1,70 @@
+# PostgreSQL
+DATABASE_URL=${DATABASE_URL:="postgres://postgres:postgres@host.docker.internal:5432/openedgar"}
+CELERY_BROKER_URL=${CELERY_BROKER_URL:="amqp://openedgar:openedgar@localhost:5672/openedgar"}
+CELERY_RESULT_BACKEND=${CELERY_RESULT_BACKEND:="rpc"}
+CELERY_RESULT_PERSISTENT=${CELERY_RESULT_PERSISTENT:="False"}
+DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:="openedgar"}
+
+# Domain name, used by caddy
+#DOMAIN_NAME=domain.com
+
+# General settings
+# DJANGO_READ_DOT_ENV_FILE=True
+# CLIENT_TYPE: S3, ADL, Local, Blob
+CLIENT_TYPE=${CLIENT_TYPE:="Local"}
+
+
+DJANGO_ADMIN_URL=${DJANGO_ADMIN_URL:=""}
+DJANGO_SETTINGS_MODULE=${DJANGO_SETTINGS_MODULE:="config.settings.production"}
+DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:="openedgar"}
+DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:="localhost"}
+
+# AWS Settings
+DJANGO_AWS_ACCESS_KEY_ID=${DJANGO_AWS_ACCESS_KEY_ID:=""}
+DJANGO_AWS_SECRET_ACCESS_KEY=${DJANGO_AWS_SECRET_ACCESS_KEY:=""}
+DJANGO_AWS_STORAGE_BUCKET_NAME=${DJANGO_AWS_STORAGE_BUCKET_NAME:=""}
+
+# Azure Data Lake Settings
+ADL_ACCOUNT=${ADL_ACCOUNT:=""}
+ADL_TENANT=${ADL_TENANT:=""}
+# Client ID
+ADL_CID=${ADL_CID:=""}
+# Client secret/password
+ADL_SECRET=${ADL_SECRET:=""}
+
+# Azure Blob Storage
+BLOB_CONNECTION_STRING=${BLOB_CONNECTION_STRING:=""}
+BLOB_CONTAINER=${BLOB_CONTAINER:="openedgar"}
+
+
+# Read rate limit
+CELERY_TASK_DEFAULT_RATE_LIMIT=${CELERY_TASK_DEFAULT_RATE_LIMIT:="10/s"}
+
+# Used with email
+DJANGO_MAILGUN_API_KEY=${DJANGO_MAILGUN_API_KEY:=""}
+DJANGO_SERVER_EMAIL=${DJANGO_SERVER_EMAIL:=""}
+MAILGUN_SENDER_DOMAIN=${MAILGUN_SENDER_DOMAIN:=""}
+EMAIL_BACKEND=${EMAIL_BACKEND:="django.core.mail.backends.console.EmailBackend"}
+
+# Security!
Better to use DNS for this task, but you can use redirect +DJANGO_SECURE_SSL_REDIRECT=${DJANGO_SECURE_SSL_REDIRECT:="False"} + +# django-allauth +DJANGO_ACCOUNT_ALLOW_REGISTRATION=${DJANGO_ACCOUNT_ALLOW_REGISTRATION:="True"} + +# AWS setup +S3_ACCESS_KEY=${S3_ACCESS_KEY:="ABCDEFGHIJKLMNOPQRST"} +S3_SECRET_KEY=${S3_SECRET_KEY:="abcdefghijklmnopqrstuvwxyz12345678901234"} +S3_BUCKET=${S3_BUCKET:=""} + +S3_PREFIX=${S3_PREFIX:="DATA"} +S3_COMPRESSION_LEVEL=${S3_COMPRESSION_LEVEL:="9"} + +# Download path +DOWNLOAD_PATH=${DOWNLOAD_PATH:="/data"} +S3_DOCUMENT_PATH=${S3_DOCUMENT_PATH:="/data"} + +# EDGAR PARAMETERS +EDGAR_YEAR=${EDGAR_YEAR:="2015"} +FORM_TYPES=${FORM_TYPES:="3, 10, 8-K, 10-Q, 10-K"} + diff --git a/docker/dot_env.sh b/docker/dot_env.sh new file mode 100755 index 0000000..4555d96 --- /dev/null +++ b/docker/dot_env.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +cat >/opt/openedgar/lexpredict_openedgar/.env << EOF +DATABASE_URL=$DATABASE_URL +CELERY_BROKER_URL=$CELERY_BROKER_URL +CELERY_RESULT_BACKEND=$CELERY_RESULT_BACKEND +CELERY_RESULT_PERSISTENT=$CELERY_RESULT_PERSISTENT +DJANGO_SECRET_KEY=$DJANGO_SECRET_KEY + +# Domain name, used by caddy +#DOMAIN_NAME=domain.com + +# General settings +# DJANGO_READ_DOT_ENV_FILE=True +# CLIENT_TYPE: AKS, ADLAKE, Local +CLIENT_TYPE=$CLIENT_TYPE + + +DJANGO_ADMIN_URL=$DJANGO_ADMIN_URL +DJANGO_SETTINGS_MODULE=$DJANGO_SETTINGS_MODULE +DJANGO_SECRET_KEY=$DJANGO_SECRET_KEY +DJANGO_ALLOWED_HOSTS=$DJANGO_ALLOWED_HOSTS + +CELERY_TASK_DEFAULT_RATE_LIMIT=$CELERY_TASK_DEFAULT_RATE_LIMIT + +# data Lake Settings +ADL_ACCOUNT=$ADL_ACCOUNT +ADL_TENANT=$ADL_TENANT +# Client ID +ADL_CID=$ADL_CID +# Client secret/password +ADL_SECRET=$ADL_SECRET + +# Azure Blob Storage +BLOB_CONNECTION_STRING=$BLOB_CONNECTION_STRING +BLOB_CONTAINER=$BLOB_CONTAINER + +# AWS Settings +DJANGO_AWS_ACCESS_KEY_ID=$DJANGO_AWS_ACCESS_KEY_ID +DJANGO_AWS_SECRET_ACCESS_KEY=$DJANGO_AWS_SECRET_ACCESS_KEY +DJANGO_AWS_STORAGE_BUCKET_NAME=$DJANGO_AWS_STORAGE_BUCKET_NAME + +# Used with email +DJANGO_MAILGUN_API_KEY=$DJANGO_MAILGUN_API_KEY +DJANGO_SERVER_EMAIL=$DJANGO_SERVER_EMAIL +MAILGUN_SENDER_DOMAIN=$MAILGUN_SENDER_DOMAIN +EMAIL_BACKEND=$EMAIL_BACKEND + +# Security! 
Better to use DNS for this task, but you can use redirect +DJANGO_SECURE_SSL_REDIRECT=$DJANGO_SECURE_SSL_REDIRECT + +# django-allauth +DJANGO_ACCOUNT_ALLOW_REGISTRATION=$DJANGO_ACCOUNT_ALLOW_REGISTRATION + +# AWS setup +S3_ACCESS_KEY=$S3_ACCESS_KEY +S3_SECRET_KEY=$S3_SECRET_KEY +S3_BUCKET=$S3_BUCKET + +S3_PREFIX=$S3_PREFIX +S3_COMPRESSION_LEVEL=$S3_COMPRESSION_LEVEL + +# Download path +DOWNLOAD_PATH=$DOWNLOAD_PATH +S3_DOCUMENT_PATH=$S3_DOCUMENT_PATH + +# EDGAR PARAMETERS +EDGAR_YEAR=$EDGAR_YEAR +FORM_TYPES=$FORM_TYPES +EOF \ No newline at end of file diff --git a/docker/erlang-solutions_1.0_all.deb b/docker/erlang-solutions_1.0_all.deb new file mode 100644 index 0000000..6cf8fef Binary files /dev/null and b/docker/erlang-solutions_1.0_all.deb differ diff --git a/docker/oe-entrypoint.sh b/docker/oe-entrypoint.sh new file mode 100755 index 0000000..03803ef --- /dev/null +++ b/docker/oe-entrypoint.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +cd /opt/openedgar/lexpredict_openedgar +export PYTHONIOENCODING=utf-8 +mkdir -p /data/logs + +source ../env/bin/activate +source /opt/openedgar/default.env + +source /opt/openedgar/dot_env.sh + +export C_FORCE_ROOT="true" + +service rabbitmq-server start + +rabbitmqctl add_user openedgar openedgar + +rabbitmqctl add_vhost openedgar + +rabbitmqctl set_permissions -p openedgar openedgar ".*" ".*" ".*" + +cd /opt/openedgar/tika + +java -jar tika-server-1.21.jar > /data/logs/tika.log 2>&1 & + +cd /opt/openedgar/lexpredict_openedgar + +source ../env/bin/activate +source /opt/openedgar/default.env + +# perform initial migration +python manage.py migrate + +celery -A lexpredict_openedgar.taskapp worker --loglevel=INFO > /data/logs/celery.log 2>&1 & + +python manage.py shell < run_edgar.py + +tail -f /data/logs/celery.log +#| grep -v "INFO rmeta/text (autodetecting type)" diff --git a/docker/run_edgar.py b/docker/run_edgar.py new file mode 100644 index 0000000..d0fca4e --- /dev/null +++ b/docker/run_edgar.py @@ -0,0 +1,21 @@ +import logging +import os +from openedgar.processes.edgar import download_filing_index_data, process_all_filing_index + +years = [t.upper().strip() for t in os.getenv("EDGAR_YEAR").split(",")] +quarter = os.getenv("EDGAR_QUARTER") +month = os.getenv("EDGAR_MONTH") +types = [t.upper().strip() for t in os.getenv("FORM_TYPES").split(",")] + +if len(quarter) == 0: + quarter = None +if len(month) == 0: + month = None + +print("Edgar analysis started") +for year in years: + print("Analysing year {} types {}".format(year, types)) + process_all_filing_index(year=year, quarter=quarter, month=month, form_type_list=types) + +for i in range(1, 10): + print("###############################################") diff --git a/lexpredict_openedgar/config/settings/base.py b/lexpredict_openedgar/config/settings/base.py index 02c2ad3..3f89365 100755 --- a/lexpredict_openedgar/config/settings/base.py +++ b/lexpredict_openedgar/config/settings/base.py @@ -332,8 +332,21 @@ S3_DOCUMENT_PATH = env('S3_DOCUMENT_PATH', default="openedgar") S3_PREFIX = env('S3_PREFIX', default="documents") S3_COMPRESSION_LEVEL = int(env('S3_COMPRESSION_LEVEL', default=6)) +DOWNLOAD_PATH = env('DOWNLOAD_PATH', default='openedgar') # Tika configuration TIKA_HOST = "localhost" TIKA_PORT = 9998 TIKA_ENDPOINT = "http://{0}:{1}/tika".format(TIKA_HOST, TIKA_PORT) + +# Azure data Lake +ADL_ACCOUNT = env('ADL_ACCOUNT', default="") +ADL_TENANT = env('ADL_TENANT', default="") +# Client ID +ADL_CID = env('ADL_CID', default="") +# Client secret/password +ADL_SECRET = env('ADL_SECRET', default="") + +# Azure 
Blob Storage +BLOB_CONNECTION_STRING = env('BLOB_CONNECTION_STRING', default="") +BLOB_CONTAINER = env("BLOB_CONTAINER", default="openedgar") diff --git a/lexpredict_openedgar/openedgar/clients/adl.py b/lexpredict_openedgar/openedgar/clients/adl.py new file mode 100755 index 0000000..5b2b535 --- /dev/null +++ b/lexpredict_openedgar/openedgar/clients/adl.py @@ -0,0 +1,123 @@ +# Libraries +import logging +import os +from typing import Union +import adal + +from msrestazure.azure_active_directory import AADTokenCredentials +from azure.datalake.store import core, lib, multithread +from config.settings.base import ADL_ACCOUNT, ADL_TENANT, ADL_CID, ADL_SECRET + +logger = logging.getLogger(__name__) + +if os.environ["CLIENT_TYPE"] == "ADL": + authority_host_uri = 'https://login.microsoftonline.com' + authority_uri = authority_host_uri + '/' + ADL_TENANT + RESOURCE = 'https://management.core.windows.net/' + + adlCreds = lib.auth(tenant_id=ADL_TENANT, + client_secret=ADL_SECRET, + client_id=ADL_CID, + resource=RESOURCE) + + context = adal.AuthenticationContext(authority_uri, api_version=None) + mgmt_token = context.acquire_token_with_client_credentials(RESOURCE, ADL_CID, ADL_SECRET) + armCreds = AADTokenCredentials(mgmt_token, ADL_CID, resource=RESOURCE) + + ## Create a filesystem client object + adlsFileSystemClient = core.AzureDLFileSystem(adlCreds, store_name=ADL_ACCOUNT) + + +class ADLClient: + """ + TODO: This class does not support the deflate option + """ + def __init__(self): + logger.info("Initialized AKS client") + + def path_exists(self, path: str, client=None): + """ + Check if an AKS path exists + :param path: + :return: true if AKS object exists, else false + """ + + return adlsFileSystemClient.exists(path) + + def get_buffer(self, remote_path: str, client=None, deflate: bool = True): + """ + Get a file from S3 given a path and optional client. + :param remote_path: S3 path under bucket + :param client: optional client to re-use + :param deflate: whether to automatically zlib deflate contents + :return: buffer bytes/str + """ + + with adlsFileSystemClient.open(remote_path, blocksize=2 ** 20) as f: + return f.read() + + def get_file(self, remote_path: str, local_path: str, client=None, deflate: bool = True): + """ + Save a local file from AKS given a path and optional client. + :param remote_path: AKS path under bucket + :param local_path: local path to save to + :param client: optional client to re-use + :param deflate: whether to automatically zlib deflate contents + :return: + """ + # Open and write buffer + with open(local_path, "wb") as out_file: + out_file.write(self.get_buffer(remote_path, client, deflate)) + + def get_buffer_segment(self, remote_path: str, start_pos: int, end_pos: int, client=None, deflate: bool = True): + """ + Get a file from S3 given a path and optional client. + :param remote_path: S3 path under bucket + :param start_pos: + :param end_pos: + :param client: optional client to re-use + :param deflate: whether to automatically zlib deflate contents + :return: + """ + # Retrieve buffer and return subset + buffer = self.get_buffer(remote_path, client, deflate) + return buffer[start_pos:end_pos] + + def put_buffer(self, remote_path: str, buffer: Union[str, bytes], client=None, deflate: bool = True, + write_bytes=True): + """ + Upload a buffer to AKS given a path. 
+ :param remote_path: AKS path + :param buffer: buffer to upload + :param client: optional client to re-use + :param deflate: whether to automatically zlib deflate contents + :return: + """ + + import tempfile + + temp = tempfile.NamedTemporaryFile(mode='w+b') + + try: + # Ensure we have bytes object + if isinstance(buffer, str): + upload_buffer = bytes(buffer, "utf-8") + elif isinstance(buffer, bytes): + upload_buffer = buffer + + temp.write(upload_buffer) + + multithread.ADLUploader(adlsFileSystemClient, lpath=temp.name, rpath=remote_path, nthreads=64, + overwrite=True, buffersize=4194304, blocksize=4194304) + finally: + temp.close() + + def put_file(self, remote_path: str, local_path: str): + """ + Save a local file from AKS. + :param remote_path: AKS remote path + :param local_path: local path to save to + :return: + """ + multithread.ADLUploader(adlsFileSystemClient, lpath=local_path, rpath=remote_path, nthreads=64, overwrite=True, + buffersize=4194304, blocksize=4194304) diff --git a/lexpredict_openedgar/openedgar/clients/blob.py b/lexpredict_openedgar/openedgar/clients/blob.py new file mode 100755 index 0000000..d78d51f --- /dev/null +++ b/lexpredict_openedgar/openedgar/clients/blob.py @@ -0,0 +1,106 @@ +# Libraries +import logging +import os +import zlib +from typing import Union + +from azure.storage.blob import BlockBlobService + +from config.settings.base import BLOB_CONNECTION_STRING, BLOB_CONTAINER, S3_COMPRESSION_LEVEL + +logger = logging.getLogger(__name__) +# Remove garbage logging from MS +logging.getLogger("azure.storage.common.storageclient").setLevel(logging.ERROR) + +if os.environ["CLIENT_TYPE"] == "Blob": + blob_service = BlockBlobService(connection_string=BLOB_CONNECTION_STRING) + blob_service.create_container(BLOB_CONTAINER) + + +class BlobClient: + + def __init__(self): + logger.info("Initialized Blob client") + + def path_exists(self, path: str, client=None): + """ + Check if an Blob path exists + :param path: + :return: true if Blob object exists, else false + """ + return blob_service.exists(BLOB_CONTAINER, path) + + def get_buffer(self, remote_path: str, client=None, deflate: bool = True): + """ + Get a file from Blob given a path and optional client. + :param remote_path: Blob path under bucket + :param client: optional client to re-use + :param deflate: whether to automatically zlib deflate contents + :return: buffer bytes/str + """ + if remote_path[0] == '/': + remote_path = remote_path[1:] + buffer = blob_service.get_blob_to_bytes(BLOB_CONTAINER, remote_path).content + if deflate: + return zlib.decompress(buffer) + else: + return buffer + + def get_file(self, remote_path: str, local_path: str, client=None, deflate: bool = True): + """ + Save a local file from Blob given a path and optional client. + :param remote_path: Blob path under bucket + :param local_path: local path to save to + :param client: optional client to re-use + :param deflate: whether to automatically zlib deflate contents + :return: + """ + with open(local_path, "wb") as out_file: + out_file.write(self.get_buffer(remote_path, client, deflate)) + + def get_buffer_segment(self, remote_path: str, start_pos: int, end_pos: int, client=None, deflate: bool = True): + """ + Get a file from S3 given a path and optional client. 
+ :param remote_path: S3 path under bucket + :param start_pos: + :param end_pos: + :param client: optional client to re-use + :param deflate: whether to automatically zlib deflate contents + :return: + """ + # Retrieve buffer and return subset + buffer = self.get_buffer(remote_path, client, deflate) + return buffer[start_pos:end_pos] + + def put_buffer(self, remote_path: str, buffer: Union[str, bytes], client=None, deflate: bool = True, + write_bytes=True): + """ + Upload a buffer to AKS given a path. + :param remote_path: AKS path + :param buffer: buffer to upload + :param client: optional client to re-use + :param deflate: whether to automatically zlib deflate contents + :return: + """ + # Ensure we have bytes object + if isinstance(buffer, str): + upload_buffer = bytes(buffer, "utf-8") + else: + upload_buffer = buffer + if remote_path[0] == '/': + remote_path = remote_path[1:] + if deflate: + upload_buffer = zlib.compress(upload_buffer, S3_COMPRESSION_LEVEL) + blob_service.create_blob_from_bytes(BLOB_CONTAINER, remote_path, upload_buffer) + + def put_file(self, remote_path: str, local_path: str, client=None, deflate: bool = True): + """ + Save a local file from AKS. + :param remote_path: AKS remote path + :param local_path: local path to save to + :param client: optional client to re-use + :param deflate: whether to automatically zlib deflate contents + :return: + """ + with open(local_path, "rb") as in_file: + self.put_buffer(remote_path, in_file.read(), client, deflate) diff --git a/lexpredict_openedgar/openedgar/clients/edgar.py b/lexpredict_openedgar/openedgar/clients/edgar.py index 70496d5..30d6015 100755 --- a/lexpredict_openedgar/openedgar/clients/edgar.py +++ b/lexpredict_openedgar/openedgar/clients/edgar.py @@ -39,13 +39,6 @@ # Setup logger logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') -console.setFormatter(formatter) -logger.addHandler(console) - def get_buffer(remote_path: str, base_path: str = HTTP_SEC_HOST): """ @@ -185,6 +178,91 @@ def list_index_by_year(year: int): return form_index_list +def list_index_by_quarter(year: int, quarter: int): + """ + Get list of index files for a given year and quarter. + :param year: filing year to retrieve + :param quarter: filing quarter to retrieve + :return: + """ + # Log entrance + logger.info("Locating form index list for {0}".format(year)) + + # Form index list + year = str(year) + form_index_list = [] + + # Get year directory list + year_index_uri = urllib.parse.urljoin(HTTP_SEC_INDEX_PATH, str(year) + "/") + year_root_list = list_path(year_index_uri) + print(year_root_list) + + # Get quarters + quarter_list = [f for f in year_root_list if "/QTR" in f] + + quarter_string_match = "QTR" + str(quarter) + + # Iterate over quarters + for qu in quarter_list: + if quarter_string_match in qu: + quarter_root_list = list_path(qu) + form_index_list.extend([q for q in quarter_root_list if "/form." in q.lower()]) + + # Cleanup double / + for i in range(len(form_index_list)): + form_index_list[i] = form_index_list[i].replace("//", "/") + + # Log exit + logger.info("Successfully located {0} form index files for {1}".format(len(form_index_list), year)) + + # Return + return form_index_list + + +def list_index_by_month(year: int, month: int): + """ + Get list of index files for a given year and month. 
+ :param year: filing year to retrieve + :param month: filing month to retrieve + :return: + """ + # Log entrance + logger.info("Locating form index list for {0}".format(year)) + + # Form index list + year = str(year) + form_index_list = [] + + # Get year directory list + year_index_uri = urllib.parse.urljoin(HTTP_SEC_INDEX_PATH, str(year) + "/") + year_root_list = list_path(year_index_uri) + print(year_root_list) + + # Get quarters + quarter_list = [f for f in year_root_list if "/QTR" in f] + + # company.20190102.idx + if int(month) < 10: + month = "0{}".format(month) + + date_string_match = year + month + + # Iterate over quarters + for qu in quarter_list: + quarter_root_list = list_path(qu) + form_index_list.extend([q for q in quarter_root_list if "/form." + date_string_match in q.lower()]) + + # Cleanup double / + for i in range(len(form_index_list)): + form_index_list[i] = form_index_list[i].replace("//", "/") + + # Log exit + logger.info("Successfully located {0} form index files for {1}".format(len(form_index_list), year)) + + # Return + return form_index_list + + def list_index(min_year: int = 1950, max_year: int = 2050): """ Get the list of form index files on SEC HTTP. diff --git a/lexpredict_openedgar/openedgar/clients/local.py b/lexpredict_openedgar/openedgar/clients/local.py index 2a7fa62..ff2f053 100644 --- a/lexpredict_openedgar/openedgar/clients/local.py +++ b/lexpredict_openedgar/openedgar/clients/local.py @@ -26,12 +26,6 @@ # Setup logger logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') -console.setFormatter(formatter) -logger.addHandler(console) class LocalClient: @@ -47,10 +41,12 @@ def put_buffer(self, file_path: str, buffer, write_bytes=True): if not os.path.exists(dir_name): os.makedirs(dir_name) if write_bytes: - mode="wb" + mode = "wb" + encoding = None else: - mode="w" - with open(file_path, mode=mode) as localfile: + mode = "w" + encoding = "utf-8" + with open(file_path, mode=mode, encoding=encoding) as localfile: localfile.write(buffer) def get_buffer(self, file_path: str): diff --git a/lexpredict_openedgar/openedgar/clients/s3.py b/lexpredict_openedgar/openedgar/clients/s3.py index 367f3aa..34e762e 100755 --- a/lexpredict_openedgar/openedgar/clients/s3.py +++ b/lexpredict_openedgar/openedgar/clients/s3.py @@ -38,12 +38,6 @@ # Setup logger logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') -console.setFormatter(formatter) -logger.addHandler(console) class S3Client: diff --git a/lexpredict_openedgar/openedgar/parsers/edgar.py b/lexpredict_openedgar/openedgar/parsers/edgar.py index 06a31c8..2e1dcc5 100755 --- a/lexpredict_openedgar/openedgar/parsers/edgar.py +++ b/lexpredict_openedgar/openedgar/parsers/edgar.py @@ -44,12 +44,6 @@ # Setup logger logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') -console.setFormatter(formatter) -logger.addHandler(console) def uudecode(buffer: Union[bytes, str]): @@ -125,7 +119,6 @@ def parse_index_file(file_name: str, double_gz: bool = False): with gzip.open(file_name, "rb") as index_file: index_buffer = index_file.read() except IOError as e: - 
logger.error("IOError parsing {0}: {1}".format(file_name, e)) # Read as plain binary with open(file_name, "rb") as index_file: index_buffer = index_file.read() @@ -135,6 +128,8 @@ def parse_index_file(file_name: str, double_gz: bool = False): index_buffer = zlib.decompress(index_buffer).decode("utf-8") logger.info("gz with valid header: decompressing {0} to {1} bytes.".format(file_name, len(index_buffer))) + else: + logger.info("IOError parsing {0}: {1}".format(file_name, e)) # Check for double-gz if double_gz: @@ -218,6 +213,29 @@ def extract_filing_header_field(buffer: Union[bytes, str], field: str): return buffer[p0:p1].strip() +def decode_filing(to_decode: Union[bytes, str]) -> Union[str, None]: + buffer = to_decode + if isinstance(buffer, bytes): + try: + # Start with UTF-8 + buffer = str(buffer.decode("utf-8")) + except UnicodeDecodeError as _: + try: + # Fallback to ISO 8859-1 + logger.warning("Falling back to ISO 8859-1 after failing to decode with UTF-8...") + buffer = str(buffer.decode("iso-8859-1")) + except UnicodeDecodeError as _: + try: + # Fallback to ISO 8859-15 + logger.warning("Falling back to ISO 8859-15 after failing to decode with UTF-8...") + buffer = str(buffer.decode("iso-8859-5")) + except UnicodeDecodeError as _: + # Give up if we can't + logger.error("Unable to decode with either UTF-8 or ISO 8859-1; giving up...") + return None + return buffer + + def parse_filing(buffer: Union[bytes, str], extract: bool = False): """ Parse a filing file by returning each document within @@ -243,18 +261,9 @@ def parse_filing(buffer: Union[bytes, str], extract: bool = False): # Typing if isinstance(buffer, bytes): - try: - # Start with UTF-8 - buffer = str(buffer.decode("utf-8")) - except UnicodeDecodeError as _: - try: - # Fallback to ISO 8859-1 - logger.warning("Falling back to ISO 8859-1 after failing to decode with UTF-8...") - buffer = str(buffer.decode("iso-8859-1")) - except UnicodeDecodeError as _: - # Give up if we can't - logger.error("Unable to decode with either UTF-8 or ISO 8859-1; giving up...") - return filing_data + buffer = decode_filing(buffer) + if buffer is None: + return filing_data # Check for SEC-HEADER block if "" in buffer or "" in buffer: diff --git a/lexpredict_openedgar/openedgar/process_text.py b/lexpredict_openedgar/openedgar/process_text.py new file mode 100644 index 0000000..9e9244a --- /dev/null +++ b/lexpredict_openedgar/openedgar/process_text.py @@ -0,0 +1,49 @@ +from typing import Union + +from bs4 import BeautifulSoup +import re + + +def html_to_text(html_doc: str) -> str: + """ + Convert html/xml to the pure text + :param html_doc: the document to convert + :return: + """ + soup = BeautifulSoup(html_doc, 'html.parser') + if soup.find('xbrl'): + return xbr(soup) + else: + return not_xbrl(soup) + + +def not_xbrl(soup: BeautifulSoup) -> str: + """ + Handle format not in XBLR + :param soup: + :return: + """ + doc = '' + for string in soup.strings: + string = string.strip() + if len(string) > 0: + doc += string + '\n' + return doc + + +def xbr(soup: BeautifulSoup) -> str: + """ + Handle XBLR format for extracting all the text fields + :param soup: + :return: + """ + doc = "" + for tag in soup.find_all(re.compile("[T,t]ext|[D,d]escription")): + if tag.string is not None: + inner_soup = BeautifulSoup(tag.string, 'html.parser') + string = "" + for string_inner in inner_soup.strings: + if string_inner is not None: + string = string + " " + string_inner + doc += string + '\n' + return doc diff --git a/lexpredict_openedgar/openedgar/processes/edgar.py 
b/lexpredict_openedgar/openedgar/processes/edgar.py index fa4b082..582e423 100755 --- a/lexpredict_openedgar/openedgar/processes/edgar.py +++ b/lexpredict_openedgar/openedgar/processes/edgar.py @@ -28,6 +28,9 @@ import os # Project import openedgar.clients.edgar +from config.settings.base import DOWNLOAD_PATH +from openedgar.clients.adl import ADLClient +from openedgar.clients.blob import BlobClient from openedgar.clients.s3 import S3Client from openedgar.clients.local import LocalClient import openedgar.clients.local @@ -37,37 +40,42 @@ # Logging setup logger = logging.getLogger(__name__) -logger.setLevel(logging.ERROR) -console = logging.StreamHandler() -console.setLevel(logging.ERROR) -formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') -console.setFormatter(formatter) -logger.addHandler(console) -def download_filing_index_data(year: int = None): +def download_filing_index_data(year: int = None, quarter: int = None, month: int = None): """ Download all filing index data. + :param month: + :param quarter: :param year: :return: """ # Get filing index list if year is not None: - filing_index_list = openedgar.clients.edgar.list_index_by_year(year) + if month is not None: + filing_index_list = openedgar.clients.edgar.list_index_by_month(year, month) + elif quarter is not None: + filing_index_list = openedgar.clients.edgar.list_index_by_quarter(year, quarter) + else: + filing_index_list = openedgar.clients.edgar.list_index_by_year(year) else: filing_index_list = openedgar.clients.edgar.list_index() path_list = [] configured_client = os.environ["CLIENT_TYPE"] - logger.info(msg="Configured client is: {}".format(configured_client)) + logger.info("Configured client is: {}".format(configured_client)) path_prefix = str() if configured_client is None or configured_client == "S3": # Create S3 client download_client = S3Client() + elif configured_client == "ADL": + download_client = ADLClient() + elif configured_client == "Blob": + download_client = BlobClient() else: download_client = LocalClient() - path_prefix = os.environ["DOWNLOAD_PATH"] + path_prefix = DOWNLOAD_PATH # Now iterate through list to check if already on S3 for filing_index_path in filing_index_list: @@ -81,10 +89,10 @@ def download_filing_index_data(year: int = None): try: filing_index = FilingIndex.objects.get(edgar_url=filing_index_path) is_processed = filing_index.is_processed - logger.info("Index {0} already exists in DB.".format(filing_index_path)) + logger.debug("Index {0} already exists in DB.".format(filing_index_path)) except FilingIndex.DoesNotExist: is_processed = False - logger.info("Index {0} does not exist in DB.".format(filing_index_path)) + logger.debug("Index {0} does not exist in DB.".format(filing_index_path)) # Check if exists; download and upload to S3 if missing if not download_client.path_exists(file_path): @@ -94,19 +102,19 @@ def download_filing_index_data(year: int = None): # Upload download_client.put_buffer(file_path, buffer) - logger.info("Retrieved {0} and uploaded to S3.".format(filing_index_path)) + logger.debug("Retrieved {0} and uploaded to S3.".format(filing_index_path)) path_list.append((file_path, True, is_processed)) else: - logger.info("Index {0} already exists on S3.".format(filing_index_path)) + logger.debug("Index {0} already exists on S3.".format(filing_index_path)) path_list.append((file_path, False, is_processed)) # Return list of updates return path_list -def process_all_filing_index(year: int = None, form_type_list: Iterable[str] = None, new_only: bool = False, - 
store_raw: bool = True, - store_text: bool = True): +def process_all_filing_index(year: int = None, quarter: int = None, month: int = None, + form_type_list: Iterable[str] = None, new_only: bool = False, + store_raw: bool = True, store_text: bool = True, store_processed: bool = True): """ Process all filing index data. :type year: optional year to process @@ -114,10 +122,11 @@ def process_all_filing_index(year: int = None, form_type_list: Iterable[str] = N :param new_only: :param store_raw: :param store_text: + :param store_processed: :return: """ # Get the list of file paths - file_path_list = download_filing_index_data(year) + file_path_list = download_filing_index_data(year, quarter, month) client_type = os.environ["CLIENT_TYPE"] or "S3" @@ -127,11 +136,11 @@ def process_all_filing_index(year: int = None, form_type_list: Iterable[str] = N if new_only and not is_processed: logger.info("Processing filing index for {0}...".format(s3_path)) _ = process_filing_index.delay(client_type, s3_path, form_type_list=form_type_list, store_raw=store_raw, - store_text=store_text) + store_text=store_text, store_processed=store_processed) elif not new_only: logger.info("Processing filing index for {0}...".format(s3_path)) _ = process_filing_index.delay(client_type, s3_path, form_type_list=form_type_list, store_raw=store_raw, - store_text=store_text) + store_text=store_text, store_processed=store_processed) else: logger.info("Skipping process_filing_index for {0}...".format(s3_path)) @@ -179,7 +188,7 @@ def search_filing_documents(term_list: Iterable[str], form_type_list: Iterable[s stem_search=stem_search) n += 1 - logger.info("Searching {0} documents for {1} terms...".format(n, len(term_list))) + logger.debug("Searching {0} documents for {1} terms...".format(n, len(term_list))) def export_filing_document_search(search_query_id: int, output_file_path: str): diff --git a/lexpredict_openedgar/openedgar/processes/s3.py b/lexpredict_openedgar/openedgar/processes/s3.py index 3e3cee1..d5cdd9a 100755 --- a/lexpredict_openedgar/openedgar/processes/s3.py +++ b/lexpredict_openedgar/openedgar/processes/s3.py @@ -31,12 +31,6 @@ # Setup logger logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') -console.setFormatter(formatter) -logger.addHandler(console) def is_access_denied_file(remote_path: str, client=None): diff --git a/lexpredict_openedgar/openedgar/tasks.py b/lexpredict_openedgar/openedgar/tasks.py index e4b2380..1da38c2 100755 --- a/lexpredict_openedgar/openedgar/tasks.py +++ b/lexpredict_openedgar/openedgar/tasks.py @@ -26,9 +26,11 @@ import datetime import hashlib import logging +import sys import tempfile import os import pathlib +import traceback from typing import Iterable, Union # Packages @@ -37,7 +39,9 @@ from celery import shared_task # Project -from config.settings.base import S3_DOCUMENT_PATH +from config.settings.base import S3_DOCUMENT_PATH, DOWNLOAD_PATH +from openedgar.clients.adl import ADLClient +from openedgar.clients.blob import BlobClient from openedgar.clients.s3 import S3Client from openedgar.clients.local import LocalClient import openedgar.clients.edgar @@ -49,16 +53,13 @@ import lexnlp.nlp.en.tokens # Logging setup +from openedgar.process_text import html_to_text + logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -console = logging.StreamHandler() -console.setLevel(logging.INFO) -formatter = 
logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') -console.setFormatter(formatter) -logger.addHandler(console) -def create_filing_documents(client, documents, filing, store_raw: bool = True, store_text: bool = True): +def create_filing_documents(client, documents, filing, store_raw: bool = True, store_text: bool = True, + store_processed=True): """ Create filing document records given a list of documents and a filing record. @@ -66,28 +67,14 @@ def create_filing_documents(client, documents, filing, store_raw: bool = True, s :param filing: Filing record :param store_raw: whether to store raw contents :param store_text: whether to store text contents + :param store_processed: whether to extract the text from the contents :return: """ # Get client if we're using S3 - # Iterate through documents document_records = [] for document in documents: - # Create DB object - filing_doc = FilingDocument() - filing_doc.filing = filing - filing_doc.type = document["type"] - filing_doc.sequence = document["sequence"] - filing_doc.file_name = document["file_name"] - filing_doc.content_type = document["content_type"] - filing_doc.description = document["description"] - filing_doc.sha1 = document["sha1"] - filing_doc.start_pos = document["start_pos"] - filing_doc.end_pos = document["end_pos"] - filing_doc.is_processed = True - filing_doc.is_error = len(document["content"]) > 0 - document_records.append(filing_doc) # Upload raw if requested if store_raw and len(document["content"]) > 0: @@ -111,6 +98,32 @@ def create_filing_documents(client, documents, filing, store_raw: bool = True, s logger.info("Text contents for filing={0}, sequence={1}, sha1={2} already exists on S3" .format(filing, document["sequence"], document["sha1"])) + if store_processed and document["content_text"] is not None: + text_path = pathlib.Path(S3_DOCUMENT_PATH, "processed", document["sha1"]).as_posix() + if not client.path_exists(text_path): + text = html_to_text(document["content_text"]) + client.put_buffer(text_path, text, write_bytes=False) + logger.info("Processed text contents for filing={0}, sequence={1}, sha1={2}" + .format(filing, document["sequence"], document["sha1"])) + else: + logger.info("Processed text contents for filing={0}, sequence={1}, sha1={2} already exists" + .format(filing, document["sequence"], document["sha1"])) + + # Create DB object + filing_doc = FilingDocument() + filing_doc.filing = filing + filing_doc.type = document["type"] + filing_doc.sequence = document["sequence"] + filing_doc.file_name = document["file_name"] + filing_doc.content_type = document["content_type"] + filing_doc.description = document["description"] + filing_doc.sha1 = document["sha1"] + filing_doc.start_pos = document["start_pos"] + filing_doc.end_pos = document["end_pos"] + filing_doc.is_processed = True + filing_doc.is_error = len(document["content"]) > 0 + document_records.append(filing_doc) + # Create in bulk FilingDocument.objects.bulk_create(document_records) return len(document_records) @@ -187,14 +200,17 @@ def create_filing_error(row, filing_path: str): @shared_task def process_filing_index(client_type: str, file_path: str, filing_index_buffer: Union[str, bytes] = None, - form_type_list: Iterable[str] = None, store_raw: bool = False, store_text: bool = False): + form_type_list: Iterable[str] = None, store_raw: bool = False, store_text: bool = False, + store_processed: bool = False): """ Process a filing index from an S3 path or buffer. 
+ :param client_type: :param file_path: S3 or local path to process; if filing_index_buffer is none, retrieved from here :param filing_index_buffer: buffer; if not present, s3_path must be set :param form_type_list: optional list of form type to process :param store_raw: :param store_text: + :param store_processed: :return: """ # Log entry @@ -202,6 +218,10 @@ def process_filing_index(client_type: str, file_path: str, filing_index_buffer: if client_type == "S3": client = S3Client() + elif client_type == "ADL": + client = ADLClient() + elif client_type == "Blob": + client = BlobClient() else: client = LocalClient() @@ -209,6 +229,9 @@ def process_filing_index(client_type: str, file_path: str, filing_index_buffer: if filing_index_buffer is None: logger.info("Retrieving filing index buffer for: {}...".format(file_path)) filing_index_buffer = client.get_buffer(file_path) + if filing_index_buffer is None: + logger.error("SOMETHING WRONG! FILING IS NONE! Client {} file {}".format(client_type, file_path)) + raise ValueError("Filing index is none!") # Write to disk to handle headaches temp_file = tempfile.NamedTemporaryFile(delete=False) @@ -217,7 +240,7 @@ def process_filing_index(client_type: str, file_path: str, filing_index_buffer: # Get main filing data structure filing_index_data = openedgar.parsers.edgar.parse_index_file(temp_file.name) - logger.info("Parsed {0} records from index".format(filing_index_data.shape[0])) + logger.warning("Parsed {0} records from index".format(filing_index_data.shape[0])) # Iterate through rows bad_record_count = 0 @@ -240,9 +263,10 @@ def process_filing_index(client_type: str, file_path: str, filing_index_buffer: logger.info("Filing record already exists: {0}".format(filing)) except Filing.MultipleObjectsReturned as e: # Create new filing record - logger.error("Multiple Filing records found for s3_path={0}, skipping...".format(filing_path)) - logger.info("Raw exception: {0}".format(e)) - continue + logger.warning("Multiple Filing records found for s3_path={0} .. 
Taking first one!".format(filing_path)) + filing = Filing.objects.filter(s3_path=filing_path).first() + logger.info("Filing record already exists: {0}".format(filing)) + logger.debug("Raw exception: {0}".format(e)) except Filing.DoesNotExist as f: # Create new filing record logger.info("No Filing record found for {0}, creating...".format(filing_path)) @@ -260,7 +284,7 @@ def process_filing_index(client_type: str, file_path: str, filing_index_buffer: continue # Upload - client.put_buffer(filing_path, filing_buffer) + client.put_buffer("{}/{}".format(DOWNLOAD_PATH,filing_path), filing_buffer) logger.info("Downloaded from EDGAR and uploaded to {}...".format(client_type)) else: @@ -269,7 +293,8 @@ def process_filing_index(client_type: str, file_path: str, filing_index_buffer: filing_buffer = client.get_buffer(filing_path) # Parse - filing_result = process_filing(client, filing_path, filing_buffer, store_raw=store_raw, store_text=store_text) + filing_result = process_filing(client, filing_path, filing_buffer, store_raw=store_raw, + store_text=store_text, store_processed=store_processed) if filing_result is None: logger.error("Unable to process filing.") bad_record_count += 1 @@ -303,19 +328,19 @@ def process_filing_index(client_type: str, file_path: str, filing_index_buffer: @shared_task def process_filing(client, file_path: str, filing_buffer: Union[str, bytes] = None, store_raw: bool = False, - store_text: bool = False): + store_text: bool = False, store_processed: bool = False): """ Process a filing from a path or filing buffer. :param file_path: path to process; if filing_buffer is none, retrieved from here :param filing_buffer: buffer; if not present, s3_path must be set :param store_raw: :param store_text: + :param store_processed :return: """ # Log entry logger.info("Processing filing {0}...".format(file_path)) - # Check for existing record first try: filing = Filing.objects.get(s3_path=file_path) @@ -332,6 +357,10 @@ def process_filing(client, file_path: str, filing_buffer: Union[str, bytes] = No if filing_buffer is None: logger.info("Retrieving filing buffer from S3...") filing_buffer = client.get_buffer(file_path) + filing_buffer = openedgar.parsers.edgar.decode_filing(filing_buffer) + if filing_buffer is None: + logger.error("Unable to read the filing {}".format(file_path)) + return None # Get main filing data structure filing_data = openedgar.parsers.edgar.parse_filing(filing_buffer, extract=store_text) @@ -409,13 +438,19 @@ def process_filing(client, file_path: str, filing_buffer: Union[str, bytes] = No # Create filing document records try: - create_filing_documents(client, filing_data["documents"], filing, store_raw=store_raw, store_text=store_text) + create_filing_documents(client, filing_data["documents"], filing, store_raw=store_raw, store_text=store_text, + store_processed=store_processed) filing.is_processed = True filing.is_error = False filing.save() return filing except Exception as e: # pylint: disable=broad-except - logger.error("Unable to create filing documents for {0}: {1}".format(filing, e)) + logger.error("423: Unable to create filing documents for {0}: {1}qGG".format(filing, e)) + exc_type, exc_value, exc_tb = sys.exc_info() + ex_txt = "" + for line in traceback.TracebackException(type(exc_type), exc_value, exc_tb).format(chain=None): + ex_txt += line + "\n" + logger.error(ex_txt) return None @@ -429,8 +464,6 @@ def extract_filing(client, file_path: str, filing_buffer: Union[str, bytes] = No """ # Get buffer - - if filing_buffer is None: logger.info("Retrieving 
filing buffer from S3...") filing_buffer = client.get_buffer(file_path) diff --git a/lexpredict_openedgar/requirements/full.txt b/lexpredict_openedgar/requirements/full.txt index f34945d..19e16c6 100755 --- a/lexpredict_openedgar/requirements/full.txt +++ b/lexpredict_openedgar/requirements/full.txt @@ -135,4 +135,5 @@ Werkzeug==0.14.1 whitenoise==3.3.1 widgetsnbextension==3.2.1 wrapt==1.10.11 +beautifulsoup4==4.7.1 https://github.com/LexPredict/lexpredict-lexnlp/archive/0.1.8.zip \ No newline at end of file diff --git a/tika/download_tika.sh b/tika/download_tika.sh index 7fcce88..53910ea 100644 --- a/tika/download_tika.sh +++ b/tika/download_tika.sh @@ -1 +1,2 @@ -wget http://www-us.apache.org/dist/tika/tika-server-1.18.jar +#!/bin/bash +wget http://www-us.apache.org/dist/tika/tika-server-1.21.jar
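Usage note: the new `quarter`/`month` arguments added to `process_all_filing_index` are driven by the `EDGAR_YEAR`, `EDGAR_QUARTER`, `EDGAR_MONTH` and `FORM_TYPES` environment variables in `docker/run_edgar.py`. The sketch below is a minimal standalone variant of that script, run through `python manage.py shell` as the entrypoint does; the `os.getenv(..., "")` defaults are an assumption added here so the script also behaves when the quarter/month variables are unset.

    import os
    from openedgar.processes.edgar import process_all_filing_index

    # Comma-separated inputs, e.g. EDGAR_YEAR="2015,2016", FORM_TYPES="10-K, 10-Q"
    years = [y.strip() for y in os.getenv("EDGAR_YEAR", "2015").split(",")]
    types = [t.upper().strip() for t in os.getenv("FORM_TYPES", "10-K").split(",")]

    # Optional narrowing; an empty string means "process the whole year".
    # Inside download_filing_index_data, month takes precedence over quarter.
    quarter = os.getenv("EDGAR_QUARTER", "") or None
    month = os.getenv("EDGAR_MONTH", "") or None

    for year in years:
        print("Analysing year {} types {}".format(year, types))
        process_all_filing_index(year=year, quarter=quarter, month=month,
                                 form_type_list=types)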