Unverified Commit 0ea279e9 authored by Dhiraj Suthar's avatar Dhiraj Suthar Committed by GitHub
Browse files

Merge pull request #1 from dileep-gadiraju/develop

addition: main_server_code, scripts, docs
No related merge requests found
Showing with 476 additions and 0 deletions
+476 -0
# scrapy config goes here !
\ No newline at end of file
src/config.py 0 → 100644
import os
# ------------------server configuration--------------------------
SERVER_HOST = '0.0.0.0'
SERVER_PORT = 5001
SERVER_DEBUG = True
SERVER_CORS = False
SERVER_STATIC_PATH = ''
# API configuration
API_URL_PREFIX = "/general"
BUILD_NUMBER = 'BUILD_NUMBER_001'
API_MANDATORY_PARAMS = ['agentId', 'search', 'type']
# Application configuration
BASIC_HTTP_USERNAME = os.environ.get('BASIC_HTTP_USERNAME')
BASIC_HTTP_PASSWORD = os.environ.get('BASIC_HTTP_PASSWORD')
# ------------------agent configuration---------------------------
# AGENT_SCRIPT_TYPES = { 'JOB_TYPE_1' : 'JOB_TYPE_1_FOLDER', 'JOB_TYPE_2' : 'JOB_TYPE_2_FOLDER' }
AGENT_SCRIPT_TYPES = {
'INFORMATION': 'info',
'PDF': 'pdf'
}
# agent configuration file
AGENT_CONFIG_PATH = 'agent_configs/agents.json'
AGENT_CONFIG_PKL_PATH = 'agent_configs/agents.pkl'
# ------------------AzureBlob Variable----------------------------
# AzureBlob variable
BLOB_INTIGRATION = False
BLOB_SAS_TOKEN = os.environ.get('BLOB_SAS_TOKEN')
BLOB_ACCOUNT_URL = os.environ.get('BLOB_ACCOUNT_URL')
BLOB_CONTAINER_NAME = os.environ.get('CONTAINER_NAME')
# ------------------Queuing variables-----------------------------
# Queuing variables
MAX_RUNNING_JOBS = int(os.environ.get('MAX_RUNNING_JOBS', 4))
MAX_WAITING_JOBS = int(os.environ.get('MAX_WAITING_JOBS', 10))
# ------------------ElasticSearch DB variables--------------------
ELASTIC_DB_URL = os.environ.get('ELASTIC_DB_URL')
# ES index variables
ES_LOG_INDEX = 'general-app-logs'
ES_JOB_INDEX = 'general-job-stats'
ES_DATA_INDEX = 'general-crawled-data'
# ------------------Logging variables-----------------------------
JOB_OUTPUT_PATH = "output"
# JobStatus variables
JOB_RUNNING_STATUS = 'RUNNING'
JOB_COMPLETED_SUCCESS_STATUS = 'COMPLETED_SUCCESS'
JOB_COMPLETED_FAILED_STATUS = 'COMPLETED_FAILED'
# ------------------Driver Variables-------------------------------
CHROMEDRIVER_PATH = 'C:\\Drivers\\chromedriver_win32\\chromedriver.exe'
# -----------------------------------------------------------------
from .response import CustomResponse
from .status import Status
from .job import JobModel
from .agent_utils import AgentUtils
class Agent(object):
def __init__(self, agentId, description, provider, scripts, URL):
self.provider = provider
self.description = description
self.agentId = agentId
self.scripts = scripts
self.URL = URL
@property
def agentId(self):
return self._agentId
@agentId.setter
def agentId(self, value):
self._agentId = value
@property
def description(self):
return self._description
@description.setter
def description(self, value):
self._description = value
@property
def provider(self):
return self._provider
@provider.setter
def provider(self, value):
self._provider = value
@property
def scripts(self):
return self._scripts
@scripts.setter
def scripts(self, value):
self._scripts = value
@property
def URL(self):
return self._URL
@URL.setter
def URL(self, value):
self._URL = value
def __str__(self):
str_1 = 'id: {0} , description: {1} , provider: {2} , scripts: {3} , URL: {4}'
str_1 = str_1.format(self.agentId, self.description,
self.provider, self.scripts, self.URL)
return str_1
import os
import pickle
from .agent_class import Agent
class AgentUtils:
def __init__(self):
self.filepath = None
@property
def filepath(self):
return self._filepath
@filepath.setter
def filepath(self, value):
self._filepath = value
def __readPklFile(self):
if os.path.exists(self.filepath):
file_pi = open(self.filepath, 'rb')
agent_list = pickle.load(file_pi)
return agent_list
else:
return []
def __writePklFile(self, agent_list):
file_pi = open(self.filepath, 'wb')
pickle.dump(agent_list, file_pi)
def addAgent(self, agentId, description, provider, scripts, URL):
agent = Agent(agentId, description, provider, scripts, URL)
agent_list = self.__readPklFile()
for old_agent in agent_list:
if old_agent.agentId == agent.agentId:
print('The agent already exists', agent)
return
agent_list.append(agent)
self.__writePklFile(agent_list)
def listAgents(self):
return_list = []
agent_list = self.__readPklFile()
for old_agent in agent_list:
agent = {}
agent['agentId'] = old_agent.agentId
agent['description'] = old_agent.description
agent['provider'] = old_agent.provider
agent['scripts'] = old_agent.scripts
agent['URL'] = old_agent.URL
return_list.append(agent)
return return_list
from common import Log
class JobModel(object):
def status(self, jobId):
'''
connect to ES DB and get the status of jobId
'''
log = Log.from_default()
return log.get_status(jobId)
from flask import jsonify
class CustomResponse:
def __init__(self, statuscode, data):
self.statuscode = statuscode
self.statuscode['data'] = data
def getres(self):
return jsonify(self.statuscode)
def getresjson(self):
return self.statuscode
import enum
class Status(enum.Enum):
SUCCESS = {
'ok': True,
'http': {'status': 200},
'why': "request successful"
}
FAILURE = {
'ok': False,
'http': {'status': 500},
'why': 'request failed'
}
ERR_SYSTEM = {
'ok': False,
'http': {'status': 500},
'why': "Internal Server Error"
}
ERR_INVALID_DATA = {
'ok': False,
'http': {'status': 400},
'why': "Invalid Data"
}
ERR_MISSING_PARAMETERS = {
'ok': False,
'http': {'status': 400},
'why': "Data Missing"
}
CORRUPT_FILE = {
'ok': False,
'http': {'status': 500},
'why': 'uploaded file is corrupt'
}
DATA_NOT_FOUND = {
'ok': False,
'http': {'status': 404},
'why': 'data not found'
}
OPERATION_NOT_PERMITTED = {
'ok': False,
'http': {'status': 400},
'why': 'operation not permitted'
}
ERR_GATEWAY = {
'ok': False,
'http': {'status': 400},
'why': 'gateway error'
}
ERR_NOTFOUND_FILE = {
'ok': False,
'http': {'status': 400},
'why': 'file not found'
}
ERR_SCHEMA_VALIDATION = {
'ok': False,
'http': {'status': 400},
'why': 'please refer api contract to check your request structure'
}
from .agent import AgentRepo
from .job import JobRepo
import uuid
from concurrent.futures import ThreadPoolExecutor
import config
from common.elastic_wrapper import Log
from models import AgentUtils
class AgentRepo:
def __init__(self):
self.agentUtils = AgentUtils()
self.executor = ThreadPoolExecutor(max_workers=config.MAX_RUNNING_JOBS)
def list(self, filepath):
self.agentUtils.filepath = filepath
result = self.agentUtils.listAgents()
for agent in result:
agent.pop('scripts')
return result
def run(self, agentRunContext, filepath):
threadStarted = False
agentRunContext.jobId = str(uuid.uuid4())
self.agentUtils.filepath = filepath
agents_list = self.agentUtils.listAgents()
threadStarted = False
for agent in agents_list:
if agent['agentId'] == agentRunContext.requestBody['agentId']:
agentRunContext.URL = agent['URL']
threadStarted = True
if self.executor._work_queue.qsize() < config.MAX_WAITING_JOBS:
log = Log(agentRunContext)
log.job(config.JOB_RUNNING_STATUS, "JOB in waiting state.")
del log
self.executor.submit(
agent['scripts'][config.AGENT_SCRIPT_TYPES[agentRunContext.jobType]], agentRunContext)
else:
return {'message': 'Already many jobs are in Waiting ... Please retry after some time.'}
if threadStarted:
return {'jobId': agentRunContext.jobId}
else:
return None
from models import JobModel
class JobRepo:
def __init__(self):
self.jobModel = JobModel()
def status(self, jobId):
return self.jobModel.status(jobId)
elasticsearch==8.0.0
Flask==1.1.2
Jinja2==2.11.3
MarkupSafe==1.1.1
Werkzeug==1.0.1
itsdangerous==1.1.0
Flask-Cors==3.0.10
Flask-RESTful==0.3.9
uuid==1.30
selenium==4.2.0
Flask-BasicAuth==0.2.0
Flask-HTTPBasicAuth==1.0.1
pandas==1.4.2
python-dateutil==2.8.1
beautifulsoup4==4.9.3
azure-storage-blob==12.10.0b1
scrapy==2.6.1
from .agent import AgentListResource, AgentRunResource
from .job import JobStatusResource
import os
import traceback
import config
from app import basic_auth
from common import ValueMissing
from flask import request
from flask_restful import Resource
from models import CustomResponse, Status
from repositories import AgentRepo
from utilities import AgentRunContext
agentRepo = AgentRepo()
def mandatory_param(req):
e_value = Status.ERR_MISSING_PARAMETERS
param_list = list()
for param in config.API_MANDATORY_PARAMS:
if req.get(param) is None:
param_list.append(param)
if len(param_list) > 0:
return ",".join(param_list), e_value
else:
return None, e_value
def check_job_type(req):
e_value = Status.ERR_INVALID_DATA
if req.get('type') in config.AGENT_SCRIPT_TYPES.keys():
return req.get('type'), e_value
else:
return None, e_value
class AgentListResource(Resource):
@basic_auth.required
def get(self):
try:
result = agentRepo.list(os.path.join(
config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PKL_PATH))
if result != None:
res = CustomResponse(Status.SUCCESS.value, result)
return res.getres()
else:
res = CustomResponse(
Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
return res.getresjson(), 400
except Exception:
res = CustomResponse(
Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
return res.getresjson(), 400
class AgentRunResource(Resource):
@basic_auth.required
def post(self):
try:
req = request.get_json()
# check mandatory params
miss, e_value = mandatory_param(req)
if miss is not None:
raise ValueMissing(miss+' - mandatory')
# check if valid JOB_TYPE
miss, e_value = check_job_type(req)
if miss is None:
raise ValueMissing('invalid type')
agentRunContext = AgentRunContext(req, miss)
result = agentRepo.run(agentRunContext, os.path.join(
config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PKL_PATH))
if result != None:
res = CustomResponse(Status.SUCCESS.value, result)
return res.getres()
else:
res = CustomResponse(
Status.ERR_GLOBAL_INVALID_DATA.value, "Invalid Agent ID")
return res.getresjson(), 400
except Exception as e:
print(traceback.format_exc())
res = CustomResponse(e_value.value, str(e))
return res.getresjson(), 400
from app import basic_auth
from flask import request
from flask_restful import Resource
from models import CustomResponse, Status
from repositories import JobRepo
jobRepo = JobRepo()
class JobStatusResource(Resource):
@basic_auth.required
def get(self):
try:
result = jobRepo.status(request.args.get('jobId'))
if result != None:
res = CustomResponse(Status.SUCCESS.value, result)
return res.getres()
else:
res = CustomResponse(
Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
return res.getresjson(), 400
except Exception as e:
print(e)
res = CustomResponse(
Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
return res.getresjson(), 400
from .agent import AGENT_BLUEPRINT
from .job import JOB_BLUEPRINT
from flask import Blueprint
from flask_restful import Api
from resources import AgentListResource, AgentRunResource
AGENT_BLUEPRINT = Blueprint("agent", __name__)
Api(AGENT_BLUEPRINT).add_resource(
AgentListResource, "/agents"
)
Api(AGENT_BLUEPRINT).add_resource(
AgentRunResource, "/run"
)
from flask import Blueprint
from flask_restful import Api
from resources import JobStatusResource
JOB_BLUEPRINT = Blueprint("job", __name__)
Api(JOB_BLUEPRINT).add_resource(
JobStatusResource, "/status"
)
from .info import *
from .pdf import *
# Scrapy
from .applied_scrapy import AppliedScrapy
from .grainger_scrapy import GraingerScrapy
# Selenium
from .applied_selenium import AppliedSelenium
from .grainger_selenium import GraingerSelenium
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment