Commit 53b4b040 authored by pushkar191098

refactor: proxy, agentContext, chromeLock

parent 9c23e379
Showing with 551 additions and 263 deletions
docs/proxy.md 0 → 100644
## Proxy Setup
#### Requirements:
[Squid](https://ubuntu.com/server/docs/proxy-servers-squid)
On Linux, run the following commands to install Squid:
```
sudo apt-get update
sudo apt-get install squid
```
#### To create the basic_auth password file that protects the proxy server
```
sudo touch /etc/squid/passwords
sudo chmod 777 /etc/squid/passwords
sudo htpasswd -c /etc/squid/passwords [USERNAME]
```
Replace [USERNAME] with your username. You will be prompted to enter a password; type it and confirm it.
#### To test the password store
`/usr/lib/squid3/basic_ncsa_auth /etc/squid/passwords`
After executing this line, the console will appear to hang: there is a prompt with no text in it.
Enter USERNAME PASSWORD (replacing these with your specific username and password) and hit return.
You should receive the response "OK".
If not, review the error message; your username/password might be incorrect. It is also possible that basic_ncsa_auth is located at a different path (e.g. lib64).
#### To configure Squid
The Squid configuration file is found at `/etc/squid/squid.conf`.
Update the file with the following configuration:
```
auth_param basic program /usr/lib/squid/basic_ncsa_auth /etc/squid/passwords
#auth_param basic realm Squid proxy-caching web server
auth_param basic realm proxy
auth_param basic credentialsttl 2400 hours
auth_param basic casesensitive off
acl authenticated proxy_auth REQUIRED
http_access allow authenticated
dns_v4_first on
forwarded_for delete
via off
http_access deny all
```
Save the file and exit. Once completed, restart the Squid service:
```
sudo systemctl restart squid
```
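Once Squid is running again, it is worth verifying the authenticated proxy end to end before pointing the crawler at it. The snippet below is a minimal sketch using the Python `requests` library; the address `203.0.113.10:3128` and the USERNAME/PASSWORD values are placeholders for your own (3128 is Squid's default listening port).
```
# Minimal smoke test of the authenticated proxy (requires `pip install requests`).
# Replace the address and credentials with your own values.
import requests

PROXY = "http://USERNAME:PASSWORD@203.0.113.10:3128"
proxies = {"http": PROXY, "https": PROXY}

# httpbin echoes the caller's IP, so the result should show the proxy VM's
# address rather than the machine this script runs on.
resp = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=15)
print(resp.status_code, resp.json())
```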
---
## Proxy VM Requirement
Server specification:
* Instance Type: Standard B2s (2 vCPUs, 4 GiB memory)
* Region: as required (e.g. Sweden-Central)
> Note: the proxy VM needs to be created in the geographic region of the target agent websites.
---
## Crawler Architectural changes for Geo-restricted websites
#### Proxy configuration
The proxy configuration lives in `/src/proxy_config.yml`.
Syntax:
```
proxies:
  # proxy with basic_auth (private proxy)
  COUNTRY1:
    URL: "{IP-ADDR}:{PORT}"
    username: "{BASIC_AUTH_USERNAME}"
    password: "{BASIC_AUTH_PASSWORD}"
  # proxy with no_auth (public proxy)
  COUNTRY2:
    URL: "{IP-ADDR}:{PORT}"
```
Example:
```
proxies:
  SWEDEN:
    URL: "123.12.12.3:8796"
    username: "XXXX"
    password: "YYYY"
  FINLAND:
    URL: "143.62.12.93:8116"
  NORWAY:
    URL: "12.45.6.1:8080"
    username: "XYZ"
    password: "ABC"
```
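For reference, the crawler resolves an entry from this file into a single proxy URL: with credentials it becomes `https://username:password@IP:PORT`, without them `https://IP:PORT`, and an unknown country resolves to an empty string (no proxy). The sketch below mirrors the `get_proxy_addr` helper added in this commit, assuming the example file above is saved as `proxy_config.yml`:
```
# Sketch of the lookup performed by the get_proxy_addr helper in this commit.
import yaml

def build_proxy_url(country, path="proxy_config.yml"):
    with open(path, "r") as f:
        proxies = yaml.safe_load(f)["proxies"]
    entry = proxies.get(country)
    if entry is None or entry.get("URL") is None:
        return ""                                   # unknown country -> no proxy
    url = entry["URL"]
    user, pwd = entry.get("username"), entry.get("password")
    if user is None or pwd is None:
        return "https://" + url                     # public proxy, no auth
    return "https://{0}:{1}@{2}".format(user, pwd, url)  # private proxy with basic_auth

print(build_proxy_url("FINLAND"))   # https://143.62.12.93:8116
print(build_proxy_url("SWEDEN"))    # https://XXXX:YYYY@123.12.12.3:8796
```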
#### To set a proxy for an agent
You can assign a proxy server to an agent in `/src/static/agent_configs/agents.json`.
Syntax:
```
[
  {
    "agentId": "AGENT-XYX",
    "description": "Crawler For Xyz",
    "provider": "Xyz",
    "URL": "https://xyz.com",
    "proxy": "COUNTRY",
    "scripts": {
      "info": "Xyzcrawler",
      "pdf": "Xyzcrawler"
    }
  }
]
```
Example:
```
[
  {
    "agentId": "RS-SCRAPY",
    "description": "Crawler For RS Components",
    "provider": "RS Components",
    "URL": "https://in.rsdelivers.com",
    "proxy": "FINLAND",
    "scripts": {
      "info": "RSScrapy",
      "pdf": "NoScripts"
    }
  }
]
```
In the example above, agent `RS-SCRAPY` routes its traffic through the `FINLAND` proxy; an agent whose entry omits the `proxy` field will not use any proxy.
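The `proxy` field is optional because the refactored `Agent` model in this commit reads it with `agentData.get('proxy', None)`. A minimal sketch of that resolution, using a trimmed-down version of the example entry above:
```
# Sketch of how the optional proxy field is resolved (trimmed example entry).
agent_data = {
    "agentId": "RS-SCRAPY",
    "URL": "https://in.rsdelivers.com",
    "proxy": "FINLAND",   # omit this key to crawl without a proxy
}

proxy = agent_data.get("proxy", None)
if proxy is None:
    print(agent_data["agentId"], "will crawl without a proxy")
else:
    print(agent_data["agentId"], "will route traffic through the", proxy, "proxy")
```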
......@@ -27,9 +27,9 @@ with open(os.path.join(config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PATH), 'r'
agent_list = json.load(f)
__import__("scripts")
my_scripts = sys.modules["scripts"]
# serialize agent config
agentUtils = AgentUtils()
agentUtils.filepath = os.path.join(
config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PKL_PATH)
......@@ -41,11 +41,8 @@ for i in range(len(agent_list)-1, len(agent_list)-len_diff-1, -1):
for type in config.AGENT_SCRIPT_TYPES.values():
agent_script[type] = my_scripts.__dict__[
type].__dict__[agent['scripts'][type]]
agentUtils.addAgent(agent['agentId'],
agent['description'],
agent['provider'],
agent_script,
agent['URL'])
agent['scripts'] = agent_script
agentUtils.addAgent(agent)
# server CORS policy
......
from .scraping_utils import get_driver
from .errors import *
from .elastic_wrapper import Log
from .errors import ValueMissing, FormatError, BadRequestError
from .blob_storage import BlobStorage
from .selenium_utils import get_driver
from .scrapy_utils import CustomScrapyFilesItem, get_scrapy_settings
......@@ -5,7 +5,7 @@ from azure.storage.blob import BlobServiceClient
class BlobStorage(object):
def __init__(self,overwrite=False):
def __init__(self, overwrite=False):
self.blob_service_client = BlobServiceClient(
account_url=config.BLOB_ACCOUNT_URL, credential=config.BLOB_SAS_TOKEN)
self.root_folder = None
......@@ -30,11 +30,12 @@ class BlobStorage(object):
def set_agent_folder(self, agent_folder):
self.root_folder = agent_folder
def upload_file(self,file_name,file_contents):
upload_file_path = os.path.join(self.root_folder,file_name)
blob_client = self.blob_service_client.get_blob_client(container=config.CONTAINER_NAME,blob=upload_file_path)
def upload_file(self, file_name, file_contents):
upload_file_path = os.path.join(self.root_folder, file_name)
blob_client = self.blob_service_client.get_blob_client(
container=config.CONTAINER_NAME, blob=upload_file_path)
try:
blob_client.upload_blob(file_contents,overwrite=self.overwrite)
blob_client.upload_blob(file_contents, overwrite=self.overwrite)
except Exception as e:
return False,str(e)
return True,'true'
return False, str(e)
return True, 'true'
......@@ -10,15 +10,17 @@ class Log(object):
def from_default(cls):
return cls(None)
def __init__(self, agentRunContext):
self.agentRunContext = agentRunContext
self.es_client = Elasticsearch([config.ELASTIC_DB_URL])
def __init__(self, agentContext):
self.agentContext = agentContext
self.es_client = Elasticsearch([config.ELASTIC_DB_URL], ca_certs=config.ELASTIC_DB_CERT, http_auth=[
config.ELASTIC_DB_USERNAME, config.ELASTIC_DB_PASSWORD])
def __populate_context(self):
data = {
'agentId': self.agentRunContext.requestBody['agentId'],
'jobId': self.agentRunContext.jobId,
'jobType': self.agentRunContext.jobType,
'agentId': self.agentContext.requestBody['agentId'],
'jobId': self.agentContext.jobId,
'jobType': self.agentContext.jobType,
'search': self.agentContext.requestBody['search'],
'timestamp': int(time.time()*1000),
'buildNumber': config.BUILD_NUMBER
}
......
......@@ -15,94 +15,104 @@ class BadRequestError(RestAPIError):
super().__init__(400, payload)
class InternalServerErrorError(RestAPIError):
class InternalServerError(RestAPIError):
def __init__(self, payload=None):
super().__init__(500, payload)
class FormatError(Exception):
def __init__(self, code, message):
self._code = code
self._message = message
class AgentError(Exception):
def __init__(self, message):
self.message = message
@property
def code(self):
return self._code
def message(self):
return self._message
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.message
class TooManyRequest(Exception):
def __init__(self, message):
self.message = message
@property
def message(self):
return self._message
def __str__(self):
return self.__class__.__name__ + ': ' + self.message
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.message
class WorkflowkeyError(Exception):
def __init__(self, code, message):
self._code = code
self._message = message
@property
def code(self):
return self._code
class LoginFailure(Exception):
def __init__(self, message):
self.message = message
@property
def message(self):
return self._message
def __str__(self):
return self.__class__.__name__ + ': ' + self.message
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.message
class FileErrors(Exception):
def __init__(self, code, message):
self._code = code
self._message = message
@property
def code(self):
return self._code
class ConnectionError(Exception):
def __init__(self, message):
self.message = message
@property
def message(self):
return self._message
def __repr__(self):
return {"code": self.code, "message": self.__class__.__name__ + ': ' + self.message}
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.message
class FileEncodingError(Exception):
def __init__(self, code, message):
self._code = code
self._message = message
@property
def code(self):
return self._code
class ParamMissing(Exception):
def __init__(self, message):
self.message = message
@property
def message(self):
return self._message
def __str__(self):
return self.__class__.__name__ + ': ' + self.message
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.message
class ServiceError(Exception):
def __init__(self, code, message):
self._code = code
self._message = message
@property
def code(self):
return self._code
class FormatError(Exception):
def __init__(self, message):
self.message = message
@property
def message(self):
return self._message
@message.setter
def message(self, value):
self._message = value
def __str__(self):
return self.__class__.__name__ + ': ' + self.message
return self.message
class ValueMissing(Exception):
......@@ -120,5 +130,18 @@ class ValueMissing(Exception):
def __str__(self):
return self.message
def __repr__(self):
return self.message
class ProxyFailure(Exception):
def __init__(self, region):
self.region = region
@property
def region(self):
return self._region
@region.setter
def region(self, value):
self._region = value
def __str__(self):
return 'Proxy failure | region : {0}'.format(self.region)
# scrapy config goes here !
\ No newline at end of file
import scrapy
from config import JOB_OUTPUT_PATH
from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.project import get_project_settings
# getting settings for scrapy crawlers
def get_scrapy_settings(jobid):
settings = {
**get_project_settings(),
'ITEM_PIPELINES': {'common.scrapy_utils.CustomFilesPipeline': 1},
'FILES_STORE': str(JOB_OUTPUT_PATH + '/{0}/'.format(jobid)),
'USER_AGENT': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
}
return settings
# custom itemClass for files purpose only
class CustomScrapyFilesItem(scrapy.Item):
file_name = scrapy.Field()
file_urls = scrapy.Field()
files = scrapy.Field()
# custom file pipeline
class CustomFilesPipeline(FilesPipeline):
def get_media_requests(self, item, info):
for my_url in item.get('file_urls', []):
yield scrapy.Request(my_url, meta={'file_name': item.get('file_name')})
def file_path(self, request, response=None, info=None):
return request.meta['file_name']
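For context, a crawler script would pass these settings to a Scrapy `CrawlerRunner` and yield `CustomScrapyFilesItem` instances so that `CustomFilesPipeline` stores each download under the job's output directory using the supplied `file_name`. Below is a minimal, hypothetical sketch; the spider name, URLs and job id are placeholders, and the crochet/Twisted reactor setup used by the real crawler scripts is omitted.
```
# Hypothetical spider using the scrapy helpers added in this commit.
import scrapy
from scrapy.crawler import CrawlerRunner
from common.scrapy_utils import CustomScrapyFilesItem, get_scrapy_settings

class PdfSpider(scrapy.Spider):
    name = "pdf_spider"
    start_urls = ["https://example.com/products"]   # placeholder URL

    def parse(self, response):
        # Each yielded item is handled by CustomFilesPipeline, which saves
        # the download as <FILES_STORE>/<file_name>.
        yield CustomScrapyFilesItem(
            file_name="datasheet.pdf",
            file_urls=["https://example.com/files/datasheet.pdf"],
        )

# get_scrapy_settings(jobid) points FILES_STORE at JOB_OUTPUT_PATH/<jobid>/
runner = CrawlerRunner(settings=get_scrapy_settings("job-1234"))
runner.crawl(PdfSpider)
runner.join()   # returns a Deferred; a running Twisted reactor is still required
```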
import os
from pathlib import Path
from threading import Lock
import config
from selenium import webdriver
import yaml
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
chrome_path = Service(config.CHROMEDRIVER_PATH)
chrome_lock = Lock()
def get_proxy_addr(country):
with open(config.PROXY_FILE, 'r') as proxylist:
proxy_list = yaml.safe_load(proxylist)['proxies']
data = proxy_list.get(country, None)
if data is None:
proxy_url = None
else:
proxy_url = data.get('URL', None)
proxy_username = data.get('username', None)
proxy_password = data.get('password', None)
del data, proxy_list
if proxy_url is None:
proxy_url = ''
elif proxy_username is None or proxy_password is None:
proxy_url = 'https://'+proxy_url
else:
proxy_url = 'https://{0}:{1}@{2}'.format(
proxy_username, proxy_password, proxy_url)
return proxy_url
def enable_download_headless(browser, download_dir):
......@@ -18,18 +42,40 @@ def enable_download_headless(browser, download_dir):
browser.execute("send_command", params)
def get_driver(temp_directory):
def get_driver(temp_directory, agentContext):
# start lock
chrome_lock.acquire(timeout=60)
# Set Directory
Path(temp_directory).mkdir(parents=True, exist_ok=True)
download_dir = os.path.join(temp_directory)
chrome_options = Options()
# Chrome capabilities
d = DesiredCapabilities.CHROME
d['goog:loggingPrefs'] = {'browser': 'ALL'}
# seleniumwire_option -> proxy server
if hasattr(agentContext, 'proxy'):
proxy_addr = get_proxy_addr(agentContext.proxy)
print('proxy_addr->', proxy_addr)
if proxy_addr is not None:
wire_option = {
'proxy': {
'http': proxy_addr,
'https': proxy_addr,
'no_proxy': 'localhost,127.0.0.1'
}
}
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--verbose')
chrome_options.add_argument('--log-level=3')
chrome_options.add_argument(
"--disable-blink-features=AutomationControlled")
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.page_load_strategy = 'normal'
......@@ -47,4 +93,7 @@ def get_driver(temp_directory):
driver = webdriver.Chrome(
service=chrome_path, options=chrome_options, desired_capabilities=d)
enable_download_headless(driver, download_dir)
# release lock
chrome_lock.release()
return driver
......@@ -9,7 +9,7 @@ SERVER_CORS = False
SERVER_STATIC_PATH = ''
# API configuration
API_URL_PREFIX = "/general"
API_URL_PREFIX = "/api"
BUILD_NUMBER = 'BUILD_NUMBER_001'
API_MANDATORY_PARAMS = ['agentId', 'search', 'type']
......@@ -45,6 +45,9 @@ MAX_WAITING_JOBS = int(os.environ.get('MAX_WAITING_JOBS', 10))
# ------------------ElasticSearch DB variables--------------------
ELASTIC_DB_URL = os.environ.get('ELASTIC_DB_URL')
ELASTIC_DB_CERT = os.environ.get('ELASTIC_DB_CERT')
ELASTIC_DB_USERNAME = os.environ.get('ELASTIC_DB_USERNAME')
ELASTIC_DB_PASSWORD = os.environ.get('ELASTIC_DB_PASSWORD')
# ES index variables
ES_LOG_INDEX = 'general-app-logs'
......@@ -54,6 +57,7 @@ ES_DATA_INDEX = 'general-crawled-data'
# ------------------Logging variables-----------------------------
JOB_OUTPUT_PATH = "output"
JOB_OUTPUT_PATH = '/'.join(os.getcwd().split('/')[:-1]) + '/' + JOB_OUTPUT_PATH
# JobStatus variables
JOB_RUNNING_STATUS = 'RUNNING'
......@@ -62,6 +66,10 @@ JOB_COMPLETED_FAILED_STATUS = 'COMPLETED_FAILED'
# ------------------Driver Variables-------------------------------
CHROMEDRIVER_PATH = 'C:\\Drivers\\chromedriver_win32\\chromedriver.exe'
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
# ------------------Proxy Config File------------------------------
PROXY_FILE = 'proxy_config.yml'
# -----------------------------------------------------------------
from .response import CustomResponse
from .status import Status
from .status import Status, get_status
from .job import JobModel
from .agent_utils import AgentUtils
class Agent(object):
def __init__(self, agentId, description, provider, scripts, URL):
self.provider = provider
self.description = description
self.agentId = agentId
self.scripts = scripts
self.URL = URL
def __init__(self, agentData):
self.provider = agentData['provider']
self.description = agentData['description']
self.agentId = agentData['agentId']
self.URL = agentData['URL']
self.scripts = agentData['scripts']
self.proxy = agentData.get('proxy', None)
@property
def agentId(self):
......@@ -46,8 +47,16 @@ class Agent(object):
def URL(self, value):
self._URL = value
@property
def proxy(self):
return self._proxy
@proxy.setter
def proxy(self, value):
self._proxy = value
def __str__(self):
str_1 = 'id: {0} , description: {1} , provider: {2} , scripts: {3} , URL: {4}'
str_1 = 'id: {0} , description: {1} , provider: {2} , scripts: {3} , URL: {4} , Proxy: {5}'
str_1 = str_1.format(self.agentId, self.description,
self.provider, self.scripts, self.URL)
self.provider, self.scripts, self.URL, self.proxy)
return str_1
......@@ -29,8 +29,8 @@ class AgentUtils:
file_pi = open(self.filepath, 'wb')
pickle.dump(agent_list, file_pi)
def addAgent(self, agentId, description, provider, scripts, URL):
agent = Agent(agentId, description, provider, scripts, URL)
def addAgent(self, agentData):
agent = Agent(agentData)
agent_list = self.__readPklFile()
for old_agent in agent_list:
if old_agent.agentId == agent.agentId:
......@@ -49,5 +49,6 @@ class AgentUtils:
agent['provider'] = old_agent.provider
agent['scripts'] = old_agent.scripts
agent['URL'] = old_agent.URL
agent['proxy'] = old_agent.proxy
return_list.append(agent)
return return_list
......@@ -57,3 +57,23 @@ class Status(enum.Enum):
'http': {'status': 400},
'why': 'please refer api contract to check your request structure'
}
ERR_TOO_MANY_REQUEST = {
'ok': False,
'http': {'status': 429},
'why': 'too many requests'
}
def get_status(exceptionType):
e_dict = {
'AgentError': Status.ERR_INVALID_DATA,
'ParamMissing': Status.ERR_MISSING_PARAMETERS,
'FormatError': Status.ERR_INVALID_DATA,
'ValueMissing': Status.ERR_INVALID_DATA,
'TooManyRequest': Status.ERR_TOO_MANY_REQUEST
}
if str(exceptionType) in e_dict.keys():
status = e_dict[str(exceptionType)]
else:
status = Status.FAILURE
return status
proxies:
  COUNTRY:
    URL: URL
    username: "USERNAME"
    password: "PASSWORD"
import uuid
import os
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
import config
from common.elastic_wrapper import Log
from models import AgentUtils
from utilities import AgentContext
from common import TooManyRequest
AGENTS_PKL_PATH = os.path.join(
config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PKL_PATH)
class AgentRepo:
def __init__(self):
self.agentUtils = AgentUtils()
self.agentUtils.filepath = AGENTS_PKL_PATH
self.agent_list = self.agentUtils.listAgents()
self.executor = ThreadPoolExecutor(max_workers=config.MAX_RUNNING_JOBS)
def list(self, filepath):
self.agentUtils.filepath = filepath
result = self.agentUtils.listAgents()
def get_agent_data(self, agentId):
data = None
for agent in self.agent_list:
if agent['agentId'] == agentId:
data = agent
break
return data
def list(self):
result = deepcopy(self.agent_list)
result = [{k: v for k, v in agent.items() if v is not None}
for agent in result]
for agent in result:
agent.pop('scripts')
return result
def run(self, agentRunContext, filepath):
threadStarted = False
agentRunContext.jobId = str(uuid.uuid4())
self.agentUtils.filepath = filepath
agents_list = self.agentUtils.listAgents()
threadStarted = False
for agent in agents_list:
if agent['agentId'] == agentRunContext.requestBody['agentId']:
agentRunContext.URL = agent['URL']
threadStarted = True
if self.executor._work_queue.qsize() < config.MAX_WAITING_JOBS:
log = Log(agentRunContext)
log.job(config.JOB_RUNNING_STATUS, "JOB in waiting state.")
del log
self.executor.submit(
agent['scripts'][config.AGENT_SCRIPT_TYPES[agentRunContext.jobType]], agentRunContext)
else:
return {'message': 'Already many jobs are in Waiting ... Please retry after some time.'}
if threadStarted:
return {'jobId': agentRunContext.jobId}
def run(self, req_data):
output = None
if self.executor._work_queue.qsize() < config.MAX_WAITING_JOBS:
agent_data = self.get_agent_data(req_data.get('agentId', None))
if agent_data is not None:
agentContext = AgentContext(agent_data, req_data)
self.executor.submit(
agent_data['scripts'][config.AGENT_SCRIPT_TYPES[agentContext.jobType]], agentContext)
output = {'jobId': agentContext.jobId}
else:
pass
else:
return None
raise TooManyRequest('Already many jobs are in Waiting ... Please retry after some time.')
return output
......@@ -15,3 +15,5 @@ python-dateutil==2.8.1
beautifulsoup4==4.9.3
azure-storage-blob==12.10.0b1
scrapy==2.6.1
selenium-wire==4.6.4
crochet==2.0.0
......@@ -3,42 +3,20 @@ import traceback
import config
from app import basic_auth
from common import ValueMissing
from flask import request
from flask_restful import Resource
from models import CustomResponse, Status
from models import CustomResponse, get_status, Status
from repositories import AgentRepo
from utilities import AgentRunContext
from common import AgentError
agentRepo = AgentRepo()
def mandatory_param(req):
e_value = Status.ERR_MISSING_PARAMETERS
param_list = list()
for param in config.API_MANDATORY_PARAMS:
if req.get(param) is None:
param_list.append(param)
if len(param_list) > 0:
return ",".join(param_list), e_value
else:
return None, e_value
def check_job_type(req):
e_value = Status.ERR_INVALID_DATA
if req.get('type') in config.AGENT_SCRIPT_TYPES.keys():
return req.get('type'), e_value
else:
return None, e_value
class AgentListResource(Resource):
@basic_auth.required
def get(self):
try:
result = agentRepo.list(os.path.join(
config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PKL_PATH))
result = agentRepo.list()
if result != None:
res = CustomResponse(Status.SUCCESS.value, result)
return res.getres()
......@@ -56,28 +34,19 @@ class AgentRunResource(Resource):
@basic_auth.required
def post(self):
try:
req = request.get_json()
# check mandatory params
miss, e_value = mandatory_param(req)
if miss is not None:
raise ValueMissing(miss+' - mandatory')
# check if valid JOB_TYPE
miss, e_value = check_job_type(req)
if miss is None:
raise ValueMissing('invalid type')
agentRunContext = AgentRunContext(req, miss)
result = agentRepo.run(agentRunContext, os.path.join(
config.SERVER_STATIC_PATH, config.AGENT_CONFIG_PKL_PATH))
req_data = request.get_json()
result = agentRepo.run(req_data)
if result != None:
res = CustomResponse(Status.SUCCESS.value, result)
return res.getres()
else:
res = CustomResponse(
Status.ERR_GLOBAL_INVALID_DATA.value, "Invalid Agent ID")
return res.getresjson(), 400
raise AgentError("Invalid Agent ID")
except Exception as e:
print(traceback.format_exc())
res = CustomResponse(e_value.value, str(e))
return res.getresjson(), 400
s_code = 400
e_class = str(type(e).__name__)
if e_class == 'TooManyRequest':
s_code = 429
res = CustomResponse(get_status(e_class).value, str(e))
return res.getresjson(), s_code
# Scrapy
from .applied_scrapy import AppliedScrapy
from .grainger_scrapy import GraingerScrapy
from .rs_scrapy import RSScrapy
# Selenium
from .applied_selenium import AppliedSelenium
from .grainger_selenium import GraingerSelenium
\ No newline at end of file
from .no_scripts import NoScripts
\ No newline at end of file
......@@ -9,18 +9,18 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def AppliedSelenium(agentRunContext):
log = Log(agentRunContext)
def AppliedSelenium(agentContext):
log = agentContext.log
try:
url = 'https://www.applied.com/search?q=:relevance:FTS:' + \
agentRunContext.requestBody['search'] + \
agentContext.requestBody['search'] + \
'&page=<page>&search-category=all&override=true&isLevelUp=false'
download_dir_id = str(agentRunContext.jobId)
download_dir_id = str(agentContext.jobId)
download_dir = os.path.join(
os.getcwd(), 'temp', 'temp-' + download_dir_id)
driver = get_driver(download_dir)
driver = get_driver(download_dir, agentContext)
driver.maximize_window()
driver.get(url)
......@@ -28,7 +28,7 @@ def AppliedSelenium(agentRunContext):
log.job(config.JOB_RUNNING_STATUS, 'Job Started')
try:
wait(EC.element_to_be_clickable(
wait.until(EC.element_to_be_clickable(
(By.ID, "CybotCookiebotDialogBodyButtonAccept")))
driver.find_element_by_id(
"CybotCookiebotDialogBodyButtonAccept").click()
......@@ -37,8 +37,6 @@ def AppliedSelenium(agentRunContext):
for page_no in range(1, 1000):
driver.get(url.replace('<page>', str(page_no)))
time.sleep(2)
if 'page' not in driver.current_url:
break
wait.until(EC.presence_of_element_located(
(By.CLASS_NAME, 'product-list')))
......@@ -61,28 +59,37 @@ def AppliedSelenium(agentRunContext):
'item': driver.find_element_by_xpath('//div[@class="customer-part-number"]').text.strip()
}
item_dict['short_description'] = list()
des = driver.find_element_by_class_name('short-description')
for ele in des.find_elements_by_xpath('.//li'):
item_dict['short_description'].append(ele.text.strip())
try:
item_dict['short_description'] = list()
des = driver.find_element_by_class_name('short-description')
for ele in des.find_elements_by_xpath('.//li'):
item_dict['short_description'].append(ele.text.strip())
except:
log.info('info', 'No Short-Description Available for {0}'.format(item_dict['item_no']))
item_dict['specification'] = dict()
spe = driver.find_element_by_id('specifications')
for table in spe.find_elements_by_xpath('.//table'):
for tr_ele in table.find_elements_by_xpath('./tbody/tr'):
key = str(tr_ele.find_element_by_xpath(
'./td[1]').text).strip()
value = str(tr_ele.find_element_by_xpath(
'./td[2]').text).strip()
item_dict['specification'][key] = value
try:
item_dict['specification'] = dict()
spe = driver.find_element_by_id('specifications')
for table in spe.find_elements_by_xpath('.//table'):
for tr_ele in table.find_elements_by_xpath('./tbody/tr'):
key = str(tr_ele.find_element_by_xpath(
'./td[1]').text).strip()
value = str(tr_ele.find_element_by_xpath(
'./td[2]').text).strip()
item_dict['specification'][key] = value
except:
log.info('info', 'No Specification Available for {0}'.format(item_dict['item_no']))
print(item_dict['specification'])
try:
log.data(item_dict)
except:
pass
driver.close()
driver.switch_to.window(driver.window_handles[0])
if 'page' not in driver.current_url:
break
log.job(config.JOB_COMPLETED_SUCCESS_STATUS,
'Successfully scraped all data')
except Exception as e:
......
import config
import crochet
import scrapy
from common import Log
from common import Log,get_scrapy_settings
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
crochet.setup()
# search_param=do630 voltage regulator (via category list)
# search_param=do 360 voltage (via product list)
# search_param=61HH68 (via direct product page)
......@@ -12,97 +15,95 @@ null = 'null'
true = 'true'
false = 'false'
# ---------------------SCRAPING-CLASS---------------------
def GraingerScrapy(agentRunContext):
class GraingerScrapy(scrapy.Spider):
name = 'GraingerScrapy'
log = Log(agentRunContext)
class GraingerScrapy(scrapy.Spider):
name = 'GraingerScrapy'
user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
main_url = 'https://www.grainger.com/'
def __init__(self, search_param):
self.start_urls = [
"https://www.grainger.com/search?searchQuery="+search_param]
super().__init__()
def parse(self, response):
if 'search?' not in response.url:
yield scrapy.Request(url=response.url, callback=self.collect_data)
def __init__(self, search_param,eslog):
self.eslog = eslog
self.main_url = 'https://www.grainger.com/'
self.start_urls = [
"https://www.grainger.com/search?searchQuery="+search_param]
super().__init__()
def parse(self, response):
if 'search?' not in response.url:
yield scrapy.Request(url=response.url, callback=self.collect_data)
else:
if len(response.css('section[aria-label="Category products"]')) > 0:
script = [i.strip() for i in response.css('script::text').extract(
) if i.strip().startswith('window.__PRELOADED_STATE__')][0]
script = eval(script.split(
'=', 1)[-1].split('window.__UI_CONFIG__')[0].strip()[:-1])
products = list(script['category']['category']
['skuToProductMap'].keys())
href = '/product/info?productArray='+','.join(products)
yield scrapy.Request(url=self.main_url+href, callback=self.get_products)
else:
if len(response.css('section[aria-label="Category products"]')) > 0:
script = [i.strip() for i in response.css('script::text').extract(
) if i.strip().startswith('window.__PRELOADED_STATE__')][0]
script = eval(script.split(
'=', 1)[-1].split('window.__UI_CONFIG__')[0].strip()[:-1])
products = list(script['category']['category']
['skuToProductMap'].keys())
href = '/product/info?productArray='+','.join(products)
yield scrapy.Request(url=self.main_url+href, callback=self.get_products)
else:
# iterate every categories
for href in response.css('a.route::attr(href)').extract():
yield scrapy.Request(url=self.main_url+href, callback=self.parse_category_page)
def parse_category_page(self, response):
script = [i.strip() for i in response.css('script::text').extract(
) if i.strip().startswith('window.__PRELOADED_STATE__')][0]
script = eval(script.split('=', 1)
[-1].split('window.__UI_CONFIG__')[0].strip()[:-1])
cat_id = script['category']['category']['id']
for i in script['category']['collections']:
coll_id = i['id']
url1 = self.main_url + \
'/experience/pub/api/products/collection/{0}?categoryId={1}'
yield scrapy.Request(url=url1.format(coll_id, cat_id), callback=self.get_products)
def get_products(self, response):
data = response.json()
if 'products' in data.keys():
for i in data['products']:
# iterate every categories
for href in response.css('a.route::attr(href)').extract():
yield scrapy.Request(url=self.main_url+href, callback=self.parse_category_page)
def parse_category_page(self, response):
script = [i.strip() for i in response.css('script::text').extract(
) if i.strip().startswith('window.__PRELOADED_STATE__')][0]
script = eval(script.split('=', 1)
[-1].split('window.__UI_CONFIG__')[0].strip()[:-1])
cat_id = script['category']['category']['id']
for i in script['category']['collections']:
coll_id = i['id']
url1 = self.main_url + \
'/experience/pub/api/products/collection/{0}?categoryId={1}'
yield scrapy.Request(url=url1.format(coll_id, cat_id), callback=self.get_products)
def get_products(self, response):
data = response.json()
if 'products' in data.keys():
for i in data['products']:
yield scrapy.Request(url=self.main_url+i['productDetailUrl'], callback=self.collect_data)
else:
for i in data.values():
if type(i) == dict and 'productDetailUrl' in i.keys():
yield scrapy.Request(url=self.main_url+i['productDetailUrl'], callback=self.collect_data)
else:
for i in data.values():
if type(i) == dict and 'productDetailUrl' in i.keys():
yield scrapy.Request(url=self.main_url+i['productDetailUrl'], callback=self.collect_data)
def collect_data(self, response):
data = dict()
main_content = response.css('.product-detail__content--large')
spec = response.css('.specifications')
data = {
'brand': main_content.css('.product-detail__brand--link::text').get().strip(),
'product-heading': main_content.css('.product-detail__heading::text').get().strip(),
'url': response.url
}
for li in main_content.css('.product-detail__product-identifiers-content'):
key = li.css(
'.product-detail__product-identifiers-label::text').get().strip()
value = li.css(
'.product-detail__product-identifiers-description::text').extract()
value = [str(i).strip() for i in value] if len(
value) > 1 else str(value[0]).strip()
data[key] = value
for li in spec.css('.specifications__item'):
key = li.css('.specifications__description::text').get()
value = li.css('.specifications__value::text').extract()
value = [str(i).strip() for i in value] if len(
value) > 1 else str(value[0]).strip()
data[key] = value
log.data(data)
def collect_data(self, response):
data = dict()
main_content = response.css('.product-detail__content--large')
spec = response.css('.specifications')
data = {
'brand': main_content.css('.product-detail__brand--link::text').get().strip(),
'product-heading': main_content.css('.product-detail__heading::text').get().strip(),
'url': response.url
}
for li in main_content.css('.product-detail__product-identifiers-content'):
key = li.css(
'.product-detail__product-identifiers-label::text').get().strip()
value = li.css(
'.product-detail__product-identifiers-description::text').extract()
value = [str(i).strip() for i in value] if len(
value) > 1 else str(value[0]).strip()
data[key] = value
for li in spec.css('.specifications__item'):
key = li.css('.specifications__description::text').get()
value = li.css('.specifications__value::text').extract()
value = [str(i).strip() for i in value] if len(
value) > 1 else str(value[0]).strip()
data[key] = value
self.eslog.data(data)
# ---------------------SCRAPING-FUNCTION---------------------
def GraingerScrapy(agentContext):
log = Log(agentContext)
log.job(config.JOB_RUNNING_STATUS, 'Job Started')
runner = CrawlerRunner()
d = runner.crawl(
GraingerScrapy, search_param=agentRunContext.requestBody.get('search'))
d.addBoth(lambda _: reactor.stop())
reactor.run()
runner = CrawlerRunner(settings=get_scrapy_settings(agentContext.jobId))
runner.crawl(
GraingerScrapy, search_param=agentContext.requestBody.get('search'), eslog=log)
runner.join()
log.job(config.JOB_COMPLETED_SUCCESS_STATUS,
'Successfully scraped all data')
\ No newline at end of file