Commit 53b4b040 authored by pushkar191098

refactor: proxy, agentContext, chromeLock

parent 9c23e379
Showing with 551 additions and 263 deletions
docs/proxy.md 0 → 100644
## Proxy Setup
#### Requirements:
[Squid](https://ubuntu.com/server/docs/proxy-servers-squid)
On Linux, run the following commands to install Squid:
```
sudo apt-get update
sudo apt-get install squid
```
#### To create the basic_auth password file that protects the proxy server
```
sudo touch /etc/squid/passwords
sudo chmod 777 /etc/squid/passwords
sudo htpasswd -c /etc/squid/passwords [USERNAME]
```
Replace [USERNAME] with your username. You will be prompted to enter a password; type it and confirm it.
#### To test the password store
`/usr/lib/squid3/basic_ncsa_auth /etc/squid/passwords`
After executing this line, the console will appear to hang: there is a prompt with no text in it.
Enter USERNAME PASSWORD (replacing these with your specific username and password) and hit return.
You should receive the response "OK".
If not, review the error message; your username/password might be incorrect. It is also possible that basic_ncsa_auth is located at a different path (e.g. lib64).
#### To configure Squid
The Squid configuration file is found at `/etc/squid/squid.conf`.
Update the file with the following configuration:
```
auth_param basic program /usr/lib/squid/basic_ncsa_auth /etc/squid/passwords
#auth_param basic realm Squid proxy-caching web server
auth_param basic realm proxy
auth_param basic credentialsttl 2400 hours
auth_param basic casesensitive off
acl authenticated proxy_auth REQUIRED
http_access allow authenticated
dns_v4_first on
forwarded_for delete
via off
http_access deny all
```
Save the file and exit. Once completed, restart the Squid service:
```
sudo systemctl restart squid
```
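Once Squid is running again, it is worth verifying the authenticated proxy end to end before pointing the crawler at it. The snippet below is a minimal sketch using the Python `requests` library; the address `203.0.113.10:3128` and the USERNAME/PASSWORD values are placeholders for your own (3128 is Squid's default listening port).
```
# Minimal smoke test of the authenticated proxy (requires `pip install requests`).
# Replace the address and credentials with your own values.
import requests

PROXY = "http://USERNAME:PASSWORD@203.0.113.10:3128"
proxies = {"http": PROXY, "https": PROXY}

# httpbin echoes the caller's IP, so the result should show the proxy VM's
# address rather than the machine this script runs on.
resp = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=15)
print(resp.status_code, resp.json())
```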
---
## Proxy VM Requirement
Server specification:
* Instance Type: Standard B2s (2 vCPUs, 4 GiB memory)
* Region: as required (e.g. Sweden-Central)
> Note: the proxy VM needs to be created in the geographic region of the target agent websites.
---
## Crawler Architectural changes for Geo-restricted websites
#### Proxy configuration
The proxy configuration lives in `/src/proxy_config.yml`.
Syntax:
```
proxies:
  # proxy with basic_auth (private proxy)
  COUNTRY1:
    URL: "{IP-ADDR}:{PORT}"
    username: "{BASIC_AUTH_USERNAME}"
    password: "{BASIC_AUTH_PASSWORD}"
  # proxy with no_auth (public proxy)
  COUNTRY2:
    URL: "{IP-ADDR}:{PORT}"
```
Example:
```
proxies:
  SWEDEN:
    URL: "123.12.12.3:8796"
    username: "XXXX"
    password: "YYYY"
  FINLAND:
    URL: "143.62.12.93:8116"
  NORWAY:
    URL: "12.45.6.1:8080"
    username: "XYZ"
    password: "ABC"
```
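For reference, the crawler resolves an entry from this file into a single proxy URL: with credentials it becomes `https://username:password@IP:PORT`, without them `https://IP:PORT`, and an unknown country resolves to an empty string (no proxy). The sketch below mirrors the `get_proxy_addr` helper added in this commit, assuming the example file above is saved as `proxy_config.yml`:
```
# Sketch of the lookup performed by the get_proxy_addr helper in this commit.
import yaml

def build_proxy_url(country, path="proxy_config.yml"):
    with open(path, "r") as f:
        proxies = yaml.safe_load(f)["proxies"]
    entry = proxies.get(country)
    if entry is None or entry.get("URL") is None:
        return ""                                   # unknown country -> no proxy
    url = entry["URL"]
    user, pwd = entry.get("username"), entry.get("password")
    if user is None or pwd is None:
        return "https://" + url                     # public proxy, no auth
    return "https://{0}:{1}@{2}".format(user, pwd, url)  # private proxy with basic_auth

print(build_proxy_url("FINLAND"))   # https://143.62.12.93:8116
print(build_proxy_url("SWEDEN"))    # https://XXXX:YYYY@123.12.12.3:8796
```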
#### To set a proxy for an agent
You can assign a proxy server to an agent in `/src/static/agent_configs/agents.json`.
Syntax:
```
[
  {
    "agentId": "AGENT-XYX",
    "description": "Crawler For Xyz",
    "provider": "Xyz",
    "URL": "https://xyz.com",
    "proxy": "COUNTRY",
    "scripts": {
      "info": "Xyzcrawler",
      "pdf": "Xyzcrawler"
    }
  }
]
```
Example:
```
[
  {
    "agentId": "RS-SCRAPY",
    "description": "Crawler For RS Components",
    "provider": "RS Components",
    "URL": "https://in.rsdelivers.com",
    "proxy": "FINLAND",
    "scripts": {
      "info": "RSScrapy",
      "pdf": "NoScripts"
    }
  }
]
```
In the example above, agent `RS-SCRAPY` routes its traffic through the `FINLAND` proxy; an agent whose entry omits the `proxy` field will not use any proxy.
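The `proxy` field is optional because the refactored `Agent` model in this commit reads it with `agentData.get('proxy', None)`. A minimal sketch of that resolution, using a trimmed-down version of the example entry above:
```
# Sketch of how the optional proxy field is resolved (trimmed example entry).
agent_data = {
    "agentId": "RS-SCRAPY",
    "URL": "https://in.rsdelivers.com",
    "proxy": "FINLAND",   # omit this key to crawl without a proxy
}

proxy = agent_data.get("proxy", None)
if proxy is None:
    print(agent_data["agentId"], "will crawl without a proxy")
else:
    print(agent_data["agentId"], "will route traffic through the", proxy, "proxy")
```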
......@@ -27,9 +27,9 @@ with open(os.path.join(config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PATH), 'r'
agent_list = json.load(f)
__import__("scripts")
my_scripts = sys.modules["scripts"]
# serialize agent config
agentUtils = AgentUtils()
agentUtils.filepath = os.path.join(
config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PKL_PATH)
......@@ -41,11 +41,8 @@ for i in range(len(agent_list)-1, len(agent_list)-len_diff-1, -1):
for type in config.AGENT_SCRIPT_TYPES.values():
agent_script[type] = my_scripts.__dict__[
type].__dict__[agent['scripts'][type]]
agentUtils.addAgent(agent['agentId'],
agent['description'],
agent['provider'],
agent_script,
agent['URL'])
agent['scripts'] = agent_script
agentUtils.addAgent(agent)
# server CORS policy
......
from .scraping_utils import get_driver
from .errors import *
from .elastic_wrapper import Log
from .errors import ValueMissing, FormatError, BadRequestError
from .blob_storage import BlobStorage
from .selenium_utils import get_driver
from .scrapy_utils import CustomScrapyFilesItem, get_scrapy_settings
......@@ -5,7 +5,7 @@ from azure.storage.blob import BlobServiceClient
class BlobStorage(object):
def __init__(self,overwrite=False):
def __init__(self, overwrite=False):
self.blob_service_client = BlobServiceClient(
account_url=config.BLOB_ACCOUNT_URL, credential=config.BLOB_SAS_TOKEN)
self.root_folder = None
......@@ -30,11 +30,12 @@ class BlobStorage(object):
def set_agent_folder(self, agent_folder):
self.root_folder = agent_folder
def upload_file(self,file_name,file_contents):
upload_file_path = os.path.join(self.root_folder,file_name)
blob_client = self.blob_service_client.get_blob_client(container=config.CONTAINER_NAME,blob=upload_file_path)
def upload_file(self, file_name, file_contents):
upload_file_path = os.path.join(self.root_folder, file_name)
blob_client = self.blob_service_client.get_blob_client(
container=config.CONTAINER_NAME, blob=upload_file_path)
try:
blob_client.upload_blob(file_contents,overwrite=self.overwrite)
blob_client.upload_blob(file_contents, overwrite=self.overwrite)
except Exception as e:
return False,str(e)
return True,'true'
return False, str(e)
return True, 'true'
......@@ -10,15 +10,17 @@ class Log(object):
def from_default(cls):
return cls(None)
def __init__(self, agentRunContext):
self.agentRunContext = agentRunContext
self.es_client = Elasticsearch([config.ELASTIC_DB_URL])
def __init__(self, agentContext):
self.agentContext = agentContext
self.es_client = Elasticsearch([config.ELASTIC_DB_URL], ca_certs=config.ELASTIC_DB_CERT, http_auth=[
config.ELASTIC_DB_USERNAME, config.ELASTIC_DB_PASSWORD])
def __populate_context(self):
data = {
'agentId': self.agentRunContext.requestBody['agentId'],
'jobId': self.agentRunContext.jobId,
'jobType': self.agentRunContext.jobType,
'agentId': self.agentContext.requestBody['agentId'],
'jobId': self.agentContext.jobId,
'jobType': self.agentContext.jobType,
'search': self.agentContext.requestBody['search'],
'timestamp': int(time.time()*1000),
'buildNumber': config.BUILD_NUMBER
}
......
......@@ -15,94 +15,104 @@ class BadRequestError(RestAPIError):
super().__init__(400, payload)
class InternalServerErrorError(RestAPIError):
class InternalServerError(RestAPIError):
def __init__(self, payload=None):
super().__init__(500, payload)
class FormatError(Exception):
def __init__(self, code, message):
self._code = code
self._message = message
class AgentError(Exception):
def __init__(self, message):
self.message = message
@property
def code(self):
return self._code
def message(self):
return self._message
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.message
class TooManyRequest(Exception):
def __init__(self, message):
self.message = message
@property
def message(self):
return self._message
def __str__(self):
return self.__class__.__name__ + ': ' + self.message
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.message
class WorkflowkeyError(Exception):
def __init__(self, code, message):
self._code = code
self._message = message
@property
def code(self):
return self._code
class LoginFailure(Exception):
def __init__(self, message):
self.message = message
@property
def message(self):
return self._message
def __str__(self):
return self.__class__.__name__ + ': ' + self.message
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.message
class FileErrors(Exception):
def __init__(self, code, message):
self._code = code
self._message = message
@property
def code(self):
return self._code
class ConnectionError(Exception):
def __init__(self, message):
self.message = message
@property
def message(self):
return self._message
def __repr__(self):
return {"code": self.code, "message": self.__class__.__name__ + ': ' + self.message}
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.message
class FileEncodingError(Exception):
def __init__(self, code, message):
self._code = code
self._message = message
@property
def code(self):
return self._code
class ParamMissing(Exception):
def __init__(self, message):
self.message = message
@property
def message(self):
return self._message
def __str__(self):
return self.__class__.__name__ + ': ' + self.message
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.message
class ServiceError(Exception):
def __init__(self, code, message):
self._code = code
self._message = message
@property
def code(self):
return self._code
class FormatError(Exception):
def __init__(self, message):
self.message = message
@property
def message(self):
return self._message
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.__class__.__name__ + ': ' + self.message
return self.message
class ValueMissing(Exception):
......@@ -120,5 +130,18 @@ class ValueMissing(Exception):
def __str__(self):
return self.message
def __repr__(self):
return self.message
class ProxyFailure(Exception):
def __init__(self, region):
self.region = region
@property
def region(self):
return self._region
@region.setter
def region(self, value):
self._region = value
def __str__(self):
return 'Proxy failure | region : {0}'.format(self.region)
# scrapy config goes here !
\ No newline at end of file
import scrapy
from config import JOB_OUTPUT_PATH
from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.project import get_project_settings
# getting settings for scrapy crawlers
def get_scrapy_settings(jobid):
settings = {
**get_project_settings(),
'ITEM_PIPELINES': {'common.scrapy_utils.CustomFilesPipeline': 1},
'FILES_STORE': str(JOB_OUTPUT_PATH + '/{0}/'.format(jobid)),
'USER_AGENT': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
}
return settings
# custom itemClass for files purpose only
class CustomScrapyFilesItem(scrapy.Item):
file_name = scrapy.Field()
file_urls = scrapy.Field()
files = scrapy.Field()
# custom file pipeline
class CustomFilesPipeline(FilesPipeline):
def get_media_requests(self, item, info):
for my_url in item.get('file_urls', []):
yield scrapy.Request(my_url, meta={'file_name': item.get('file_name')})
def file_path(self, request, response=None, info=None):
return request.meta['file_name']
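For context, a crawler script would pass these settings to a Scrapy `CrawlerRunner` and yield `CustomScrapyFilesItem` instances so that `CustomFilesPipeline` stores each download under the job's output directory using the supplied `file_name`. Below is a minimal, hypothetical sketch; the spider name, URLs and job id are placeholders, and the crochet/Twisted reactor setup used by the real crawler scripts is omitted.
```
# Hypothetical spider using the scrapy helpers added in this commit.
import scrapy
from scrapy.crawler import CrawlerRunner
from common.scrapy_utils import CustomScrapyFilesItem, get_scrapy_settings

class PdfSpider(scrapy.Spider):
    name = "pdf_spider"
    start_urls = ["https://example.com/products"]   # placeholder URL

    def parse(self, response):
        # Each yielded item is handled by CustomFilesPipeline, which saves
        # the download as <FILES_STORE>/<file_name>.
        yield CustomScrapyFilesItem(
            file_name="datasheet.pdf",
            file_urls=["https://example.com/files/datasheet.pdf"],
        )

# get_scrapy_settings(jobid) points FILES_STORE at JOB_OUTPUT_PATH/<jobid>/
runner = CrawlerRunner(settings=get_scrapy_settings("job-1234"))
runner.crawl(PdfSpider)
runner.join()   # returns a Deferred; a running Twisted reactor is still required
```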
import os
from pathlib import Path
from threading import Lock
import config
from selenium import webdriver
import yaml
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
chrome_path = Service(config.CHROMEDRIVER_PATH)
chrome_lock = Lock()
def get_proxy_addr(country):
with open(config.PROXY_FILE, 'r') as proxylist:
proxy_list = yaml.safe_load(proxylist)['proxies']
data = proxy_list.get(country, None)
if data is None:
proxy_url = None
else:
proxy_url = data.get('URL', None)
proxy_username = data.get('username', None)
proxy_password = data.get('password', None)
del data, proxy_list
if proxy_url is None:
proxy_url = ''
elif proxy_username is None or proxy_password is None:
proxy_url = 'https://'+proxy_url
else:
proxy_url = 'https://{0}:{1}@{2}'.format(
proxy_username, proxy_password, proxy_url)
return proxy_url
def enable_download_headless(browser, download_dir):
......@@ -18,18 +42,40 @@ def enable_download_headless(browser, download_dir):
browser.execute("send_command", params)
def get_driver(temp_directory):
def get_driver(temp_directory, agentContext):
# start lock
chrome_lock.acquire(timeout=60)
# Set Directory
Path(temp_directory).mkdir(parents=True, exist_ok=True)
download_dir = os.path.join(temp_directory)
chrome_options = Options()
# Chrome capabilities
d = DesiredCapabilities.CHROME
d['goog:loggingPrefs'] = {'browser': 'ALL'}
# seleniumwire_option -> proxy server
if hasattr(agentContext, 'proxy'):
proxy_addr = get_proxy_addr(agentContext.proxy)
print('proxy_addr->', proxy_addr)
if proxy_addr is not None:
wire_option = {
'proxy': {
'http': proxy_addr,
'https': proxy_addr,
'no_proxy': 'localhost,127.0.0.1'
}
}
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--verbose')
chrome_options.add_argument('--log-level=3')
chrome_options.add_argument(
"--disable-blink-features=AutomationControlled")
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.page_load_strategy = 'normal'
......@@ -47,4 +93,7 @@ def get_driver(temp_directory):
driver = webdriver.Chrome(
service=chrome_path, options=chrome_options, desired_capabilities=d)
enable_download_headless(driver, download_dir)
# release lock
chrome_lock.release()
return driver
......@@ -9,7 +9,7 @@ SERVER_CORS = False
SERVER_STATIC_PATH = ''
# API configuration
API_URL_PREFIX = "/general"
API_URL_PREFIX = "/api"
BUILD_NUMBER = 'BUILD_NUMBER_001'
API_MANDATORY_PARAMS = ['agentId', 'search', 'type']
......@@ -45,6 +45,9 @@ MAX_WAITING_JOBS = int(os.environ.get('MAX_WAITING_JOBS', 10))
# ------------------ElasticSearch DB variables--------------------
ELASTIC_DB_URL = os.environ.get('ELASTIC_DB_URL')
ELASTIC_DB_CERT = os.environ.get('ELASTIC_DB_CERT')
ELASTIC_DB_USERNAME = os.environ.get('ELASTIC_DB_USERNAME')
ELASTIC_DB_PASSWORD = os.environ.get('ELASTIC_DB_PASSWORD')
# ES index variables
ES_LOG_INDEX = 'general-app-logs'
......@@ -54,6 +57,7 @@ ES_DATA_INDEX = 'general-crawled-data'
# ------------------Logging variables-----------------------------
JOB_OUTPUT_PATH = "output"
JOB_OUTPUT_PATH = '/'.join(os.getcwd().split('/')[:-1]) + '/' + JOB_OUTPUT_PATH
# JobStatus variables
JOB_RUNNING_STATUS = 'RUNNING'
......@@ -62,6 +66,10 @@ JOB_COMPLETED_FAILED_STATUS = 'COMPLETED_FAILED'
# ------------------Driver Variables-------------------------------
CHROMEDRIVER_PATH = 'C:\\Drivers\\chromedriver_win32\\chromedriver.exe'
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
# ------------------Proxy Config File------------------------------
PROXY_FILE = 'proxy_config.yml'
# -----------------------------------------------------------------
from .response import CustomResponse
from .status import Status
from .status import Status, get_status
from .job import JobModel
from .agent_utils import AgentUtils
class Agent(object):
def __init__(self, agentId, description, provider, scripts, URL):
self.provider = provider
self.description = description
self.agentId = agentId
self.scripts = scripts
self.URL = URL
def __init__(self, agentData):
self.provider = agentData['provider']
self.description = agentData['description']
self.agentId = agentData['agentId']
self.URL = agentData['URL']
self.scripts = agentData['scripts']
self.proxy = agentData.get('proxy', None)
@property
def agentId(self):
......@@ -46,8 +47,16 @@ class Agent(object):
def URL(self, value):
self._URL = value
@property
def proxy(self):
return self._proxy
@proxy.setter
def proxy(self, value):
self._proxy = value
def __str__(self):
str_1 = 'id: {0} , description: {1} , provider: {2} , scripts: {3} , URL: {4}'
str_1 = 'id: {0} , description: {1} , provider: {2} , scripts: {3} , URL: {4} , Proxy: {5}'
str_1 = str_1.format(self.agentId, self.description,
self.provider, self.scripts, self.URL)
self.provider, self.scripts, self.URL, self.proxy)
return str_1
......@@ -29,8 +29,8 @@ class AgentUtils:
file_pi = open(self.filepath, 'wb')
pickle.dump(agent_list, file_pi)
def addAgent(self, agentId, description, provider, scripts, URL):
agent = Agent(agentId, description, provider, scripts, URL)
def addAgent(self, agentData):
agent = Agent(agentData)
agent_list = self.__readPklFile()
for old_agent in agent_list:
if old_agent.agentId == agent.agentId:
......@@ -49,5 +49,6 @@ class AgentUtils:
agent['provider'] = old_agent.provider
agent['scripts'] = old_agent.scripts
agent['URL'] = old_agent.URL
agent['proxy'] = old_agent.proxy
return_list.append(agent)
return return_list
......@@ -57,3 +57,23 @@ class Status(enum.Enum):
'http': {'status': 400},
'why': 'please refer api contract to check your request structure'
}
ERR_TOO_MANY_REQUEST = {
'ok': False,
'http': {'status': 429},
'why': 'too many requests'
}
def get_status(exceptionType):
e_dict = {
'AgentError': Status.ERR_INVALID_DATA,
'ParamMissing': Status.ERR_MISSING_PARAMETERS,
'FormatError': Status.ERR_INVALID_DATA,
'ValueMissing': Status.ERR_INVALID_DATA,
'TooManyRequest': Status.ERR_TOO_MANY_REQUEST
}
if str(exceptionType) in e_dict.keys():
status = e_dict[str(exceptionType)]
else:
status = Status.FAILURE
return status
proxies:
  COUNTRY:
    URL: URL
    username: "USERNAME"
    password: "PASSWORD"
import uuid
import os
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
import config
from common.elastic_wrapper import Log
from models import AgentUtils
from utilities import AgentContext
from common import TooManyRequest
AGENTS_PKL_PATH = os.path.join(
config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PKL_PATH)
class AgentRepo:
def __init__(self):
self.agentUtils = AgentUtils()
self.agentUtils.filepath = AGENTS_PKL_PATH
self.agent_list = self.agentUtils.listAgents()
self.executor = ThreadPoolExecutor(max_workers=config.MAX_RUNNING_JOBS)
def list(self, filepath):
self.agentUtils.filepath = filepath
result = self.agentUtils.listAgents()
def get_agent_data(self, agentId):
data = None
for agent in self.agent_list:
if agent['agentId'] == agentId:
data = agent
break
return data
def list(self):
result = deepcopy(self.agent_list)
result = [{k: v for k, v in agent.items() if v is not None}
for agent in result]
for agent in result:
agent.pop('scripts')
return result
def run(self, agentRunContext, filepath):
threadStarted = False
agentRunContext.jobId = str(uuid.uuid4())
self.agentUtils.filepath = filepath
agents_list = self.agentUtils.listAgents()
threadStarted = False
for agent in agents_list:
if agent['agentId'] == agentRunContext.requestBody['agentId']:
agentRunContext.URL = agent['URL']
threadStarted = True
if self.executor._work_queue.qsize() < config.MAX_WAITING_JOBS:
log = Log(agentRunContext)
log.job(config.JOB_RUNNING_STATUS, "JOB in waiting state.")
del log
self.executor.submit(
agent['scripts'][config.AGENT_SCRIPT_TYPES[agentRunContext.jobType]], agentRunContext)
else:
return {'message': 'Already many jobs are in Waiting ... Please retry after some time.'}
if threadStarted:
return {'jobId': agentRunContext.jobId}
def run(self, req_data):
output = None
if self.executor._work_queue.qsize() < config.MAX_WAITING_JOBS:
agent_data = self.get_agent_data(req_data.get('agentId', None))
if agent_data is not None:
agentContext = AgentContext(agent_data, req_data)
self.executor.submit(
agent_data['scripts'][config.AGENT_SCRIPT_TYPES[agentContext.jobType]], agentContext)
output = {'jobId': agentContext.jobId}
else:
pass
else:
return None
raise TooManyRequest('Already many jobs are in Waiting ... Please retry after some time.')
return output
......@@ -15,3 +15,5 @@ python-dateutil==2.8.1
beautifulsoup4==4.9.3
azure-storage-blob==12.10.0b1
scrapy==2.6.1
selenium-wire==4.6.4
crochet==2.0.0
......@@ -3,42 +3,20 @@ import traceback
import config
from app import basic_auth
from common import ValueMissing
from flask import request
from flask_restful import Resource
from models import CustomResponse, Status
from models import CustomResponse, get_status, Status
from repositories import AgentRepo
from utilities import AgentRunContext
from common import AgentError
agentRepo = AgentRepo()
def mandatory_param(req):
e_value = Status.ERR_MISSING_PARAMETERS
param_list = list()
for param in config.API_MANDATORY_PARAMS:
if req.get(param) is None:
param_list.append(param)
if len(param_list) > 0:
return ",".join(param_list), e_value
else:
return None, e_value
def check_job_type(req):
e_value = Status.ERR_INVALID_DATA
if req.get('type') in config.AGENT_SCRIPT_TYPES.keys():
return req.get('type'), e_value
else:
return None, e_value
class AgentListResource(Resource):
@basic_auth.required
def get(self):
try:
result = agentRepo.list(os.path.join(
config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PKL_PATH))
result = agentRepo.list()
if result != None:
res = CustomResponse(Status.SUCCESS.value, result)
return res.getres()
......@@ -56,28 +34,19 @@ class AgentRunResource(Resource):
@basic_auth.required
def post(self):
try:
req = request.get_json()
# check mandatory params
miss, e_value = mandatory_param(req)
if miss is not None:
raise ValueMissing(miss+' - mandatory')
# check if valid JOB_TYPE
miss, e_value = check_job_type(req)
if miss is None:
raise ValueMissing('invalid type')
agentRunContext = AgentRunContext(req, miss)
result = agentRepo.run(agentRunContext, os.path.join(
config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PKL_PATH))
req_data = request.get_json()
result = agentRepo.run(req_data)
if result != None:
res = CustomResponse(Status.SUCCESS.value, result)
return res.getres()
else:
res = CustomResponse(
Status.ERR_GLOBAL_INVALID_DATA.value, "Invalid Agent ID")
return res.getresjson(), 400
raise AgentError("Invalid Agent ID")
except Exception as e:
print(traceback.format_exc())
res = CustomResponse(e_value.value, str(e))
return res.getresjson(), 400
s_code = 400
e_class = str(type(e).__name__)
if e_class == 'TooManyRequest':
s_code = 429
res = CustomResponse(get_status(e_class).value, str(e))
return res.getresjson(), s_code
# Scrapy
from .applied_scrapy import AppliedScrapy
from .grainger_scrapy import GraingerScrapy
from .rs_scrapy import RSScrapy
# Selenium
from .applied_selenium import AppliedSelenium
from .grainger_selenium import GraingerSelenium
\ No newline at end of file
from .no_scripts import NoScripts
\ No newline at end of file
......@@ -9,18 +9,18 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def AppliedSelenium(agentRunContext):
log = Log(agentRunContext)
def AppliedSelenium(agentContext):
log = agentContext.log
try:
url = 'https://www.applied.com/search?q=:relevance:FTS:' + \
agentRunContext.requestBody['search'] + \
agentContext.requestBody['search'] + \
'&page=<page>&search-category=all&override=true&isLevelUp=false'
download_dir_id = str(agentRunContext.jobId)
download_dir_id = str(agentContext.jobId)
download_dir = os.path.join(
os.getcwd(), 'temp', 'temp-' + download_dir_id)
driver = get_driver(download_dir)
driver = get_driver(download_dir, agentContext)
driver.maximize_window()
driver.get(url)
......@@ -28,7 +28,7 @@ def AppliedSelenium(agentRunContext):
log.job(config.JOB_RUNNING_STATUS, 'Job Started')
try:
wait(EC.element_to_be_clickable(
wait.until(EC.element_to_be_clickable(
(By.ID, "CybotCookiebotDialogBodyButtonAccept")))
driver.find_element_by_id(
"CybotCookiebotDialogBodyButtonAccept").click()
......@@ -37,8 +37,6 @@ def AppliedSelenium(agentRunContext):
for page_no in range(1, 1000):
driver.get(url.replace('<page>', str(page_no)))
time.sleep(2)
if 'page' not in driver.current_url:
break
wait.until(EC.presence_of_element_located(
(By.CLASS_NAME, 'product-list')))
......@@ -61,28 +59,37 @@ def AppliedSelenium(agentRunContext):
'item': driver.find_element_by_xpath('//div[@class="customer-part-number"]').text.strip()
}
item_dict['short_description'] = list()
des = driver.find_element_by_class_name('short-description')
for ele in des.find_elements_by_xpath('.//li'):
item_dict['short_description'].append(ele.text.strip())
try:
item_dict['short_description'] = list()
des = driver.find_element_by_class_name('short-description')
for ele in des.find_elements_by_xpath('.//li'):
item_dict['short_description'].append(ele.text.strip())
except:
log.info('info', 'No Short-Description Available for {0}'.format(item_dict['item_no']))
item_dict['specification'] = dict()
spe = driver.find_element_by_id('specifications')
for table in spe.find_elements_by_xpath('.//table'):
for tr_ele in table.find_elements_by_xpath('./tbody/tr'):
key = str(tr_ele.find_element_by_xpath(
'./td[1]').text).strip()
value = str(tr_ele.find_element_by_xpath(
'./td[2]').text).strip()
item_dict['specification'][key] = value
try:
item_dict['specification'] = dict()
spe = driver.find_element_by_id('specifications')
for table in spe.find_elements_by_xpath('.//table'):
for tr_ele in table.find_elements_by_xpath('./tbody/tr'):
key = str(tr_ele.find_element_by_xpath(
'./td[1]').text).strip()
value = str(tr_ele.find_element_by_xpath(
'./td[2]').text).strip()
item_dict['specification'][key] = value
except:
log.info('info', 'No Specification Available for {0}'.format(item_dict['item_no']))
print(item_dict['specification'])
try:
log.data(item_dict)
except:
pass
driver.close()
driver.switch_to.window(driver.window_handles[0])
if 'page' not in driver.current_url:
break
log.job(config.JOB_COMPLETED_SUCCESS_STATUS,
'Successfully scraped all data')
except Exception as e:
......
import config
import crochet
import scrapy
from common import Log
from common import Log,get_scrapy_settings
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
crochet.setup()
# search_param=do630 voltage regulator (via category list)
# search_param=do 360 voltage (via product list)
# search_param=61HH68 (via direct product page)
......@@ -12,97 +15,95 @@ null = 'null'
true = 'true'
false = 'false'
# ---------------------SCRAPING-CLASS---------------------
def GraingerScrapy(agentRunContext):
class GraingerScrapy(scrapy.Spider):
name = 'GraingerScrapy'
log = Log(agentRunContext)
class GraingerScrapy(scrapy.Spider):
name = 'GraingerScrapy'
user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
main_url = 'https://www.grainger.com/'
def __init__(self, search_param):
self.start_urls = [
"https://www.grainger.com/search?searchQuery="+search_param]
super().__init__()
def parse(self, response):
if 'search?' not in response.url:
yield scrapy.Request(url=response.url, callback=self.collect_data)
def __init__(self, search_param,eslog):
self.eslog = eslog
self.main_url = 'https://www.grainger.com/'
self.start_urls = [
"https://www.grainger.com/search?searchQuery="+search_param]
super().__init__()
def parse(self, response):
if 'search?' not in response.url:
yield scrapy.Request(url=response.url, callback=self.collect_data)
else:
if len(response.css('section[aria-label="Category products"]')) > 0:
script = [i.strip() for i in response.css('script::text').extract(
) if i.strip().startswith('window.__PRELOADED_STATE__')][0]
script = eval(script.split(
'=', 1)[-1].split('window.__UI_CONFIG__')[0].strip()[:-1])
products = list(script['category']['category']
['skuToProductMap'].keys())
href = '/product/info?productArray='+','.join(products)
yield scrapy.Request(url=self.main_url+href, callback=self.get_products)
else:
if len(response.css('section[aria-label="Category products"]')) > 0:
script = [i.strip() for i in response.css('script::text').extract(
) if i.strip().startswith('window.__PRELOADED_STATE__')][0]
script = eval(script.split(
'=', 1)[-1].split('window.__UI_CONFIG__')[0].strip()[:-1])
products = list(script['category']['category']
['skuToProductMap'].keys())
href = '/product/info?productArray='+','.join(products)
yield scrapy.Request(url=self.main_url+href, callback=self.get_products)
else:
# iterate every categories
for href in response.css('a.route::attr(href)').extract():
yield scrapy.Request(url=self.main_url+href, callback=self.parse_category_page)
def parse_category_page(self, response):
script = [i.strip() for i in response.css('script::text').extract(
) if i.strip().startswith('window.__PRELOADED_STATE__')][0]
script = eval(script.split('=', 1)
[-1].split('window.__UI_CONFIG__')[0].strip()[:-1])
cat_id = script['category']['category']['id']
for i in script['category']['collections']:
coll_id = i['id']
url1 = self.main_url + \
'/experience/pub/api/products/collection/{0}?categoryId={1}'
yield scrapy.Request(url=url1.format(coll_id, cat_id), callback=self.get_products)
def get_products(self, response):
data = response.json()
if 'products' in data.keys():
for i in data['products']:
# iterate every categories
for href in response.css('a.route::attr(href)').extract():
yield scrapy.Request(url=self.main_url+href, callback=self.parse_category_page)
def parse_category_page(self, response):
script = [i.strip() for i in response.css('script::text').extract(
) if i.strip().startswith('window.__PRELOADED_STATE__')][0]
script = eval(script.split('=', 1)
[-1].split('window.__UI_CONFIG__')[0].strip()[:-1])
cat_id = script['category']['category']['id']
for i in script['category']['collections']:
coll_id = i['id']
url1 = self.main_url + \
'/experience/pub/api/products/collection/{0}?categoryId={1}'
yield scrapy.Request(url=url1.format(coll_id, cat_id), callback=self.get_products)
def get_products(self, response):
data = response.json()
if 'products' in data.keys():
for i in data['products']:
yield scrapy.Request(url=self.main_url+i['productDetailUrl'], callback=self.collect_data)
else:
for i in data.values():
if type(i) == dict and 'productDetailUrl' in i.keys():
yield scrapy.Request(url=self.main_url+i['productDetailUrl'], callback=self.collect_data)
else:
for i in data.values():
if type(i) == dict and 'productDetailUrl' in i.keys():
yield scrapy.Request(url=self.main_url+i['productDetailUrl'], callback=self.collect_data)
def collect_data(self, response):
data = dict()
main_content = response.css('.product-detail__content--large')
spec = response.css('.specifications')
data = {
'brand': main_content.css('.product-detail__brand--link::text').get().strip(),
'product-heading': main_content.css('.product-detail__heading::text').get().strip(),
'url': response.url
}
for li in main_content.css('.product-detail__product-identifiers-content'):
key = li.css(
'.product-detail__product-identifiers-label::text').get().strip()
value = li.css(
'.product-detail__product-identifiers-description::text').extract()
value = [str(i).strip() for i in value] if len(
value) > 1 else str(value[0]).strip()
data[key] = value
for li in spec.css('.specifications__item'):
key = li.css('.specifications__description::text').get()
value = li.css('.specifications__value::text').extract()
value = [str(i).strip() for i in value] if len(
value) > 1 else str(value[0]).strip()
data[key] = value
log.data(data)
def collect_data(self, response):
data = dict()
main_content = response.css('.product-detail__content--large')
spec = response.css('.specifications')
data = {
'brand': main_content.css('.product-detail__brand--link::text').get().strip(),
'product-heading': main_content.css('.product-detail__heading::text').get().strip(),
'url': response.url
}
for li in main_content.css('.product-detail__product-identifiers-content'):
key = li.css(
'.product-detail__product-identifiers-label::text').get().strip()
value = li.css(
'.product-detail__product-identifiers-description::text').extract()
value = [str(i).strip() for i in value] if len(
value) > 1 else str(value[0]).strip()
data[key] = value
for li in spec.css('.specifications__item'):
key = li.css('.specifications__description::text').get()
value = li.css('.specifications__value::text').extract()
value = [str(i).strip() for i in value] if len(
value) > 1 else str(value[0]).strip()
data[key] = value
self.eslog.data(data)
# ---------------------SCRAPING-FUNCTION---------------------
def GraingerScrapy(agentContext):
log = Log(agentContext)
log.job(config.JOB_RUNNING_STATUS, 'Job Started')
runner = CrawlerRunner()
d = runner.crawl(
GraingerScrapy, search_param=agentRunContext.requestBody.get('search'))
d.addBoth(lambda _: reactor.stop())
reactor.run()
runner = CrawlerRunner(settings=get_scrapy_settings(agentContext.jobId))
runner.crawl(
GraingerScrapy, search_param=agentContext.requestBody.get('search'), eslog=log)
runner.join()
log.job(config.JOB_COMPLETED_SUCCESS_STATUS,
'Successfully scraped all data')
\ No newline at end of file