first commit

This commit is contained in:
shahab00x 2024-02-18 03:56:20 +03:30
parent ec9a6a3b81
commit edd3fcaca5
4 changed files with 243 additions and 135 deletions

41
product_selector.yml Normal file
View File

@ -0,0 +1,41 @@
# Selectorlib field definitions for an Amazon product page.
# Loaded by the scraper via Extractor.from_yaml_file('product_selector.yml').
#
# NOTE(review): the nesting below was reconstructed from a flattened listing
# in which indentation had been lost; it follows the selectorlib layout of
# field -> {css, type, attribute, multiple, children}. Confirm against the
# committed file.

name:
    css: '#productTitle'
    type: Text
price:
    css: '#price_inside_buybox'
    type: Text
# Used by the scraper as the fallback description when product_description
# is missing.
short_description:
    css: '#featurebullets_feature_div'
    type: Text
# The scraper parses this attribute value and keeps its keys as the image list.
images:
    css: '.imgTagWrapper img'
    type: Attribute
    attribute: data-a-dynamic-image
rating:
    css: span.arp-rating-out-of-text
    type: Text
number_of_reviews:
    css: 'a.a-link-normal h2'
    type: Text
# One entry per matched element; each entry carries the child fields below.
variants:
    css: 'form.a-section li'
    multiple: true
    type: Text
    children:
        # Empty css: presumably reads the attribute from the matched
        # variant element itself - TODO confirm.
        name:
            css: ""
            type: Attribute
            attribute: title
        asin:
            css: ""
            type: Attribute
            attribute: data-defaultasin
product_description:
    css: '#productDescription'
    type: Text
sales_rank:
    css: 'li#SalesRank'
    type: Text
link_to_all_reviews:
    css: 'div.card-padding a.a-link-emphasis'
    type: Link

38
review_selector.yml Normal file
View File

@ -0,0 +1,38 @@
# Selectorlib field definitions for an Amazon product-reviews page.
#
# NOTE(review): the nesting below was reconstructed from a flattened listing
# in which indentation had been lost; it follows the selectorlib layout of
# field -> {css, type, attribute, multiple, children}. In particular, confirm
# that `next_page` is a top-level field and not a child of `reviews`.

product_title:
    css: 'h1 a[data-hook="product-link"]'
    type: Text
# One entry per matched review element; the scraper reads each entry's
# `title` and `content` when building the text sent to the model.
reviews:
    css: 'div.review div.a-section.celwidget'
    multiple: true
    type: Text
    children:
        title:
            css: a.review-title
            type: Text
        content:
            css: 'div.a-row.review-data span.review-text'
            type: Text
        date:
            css: span.a-size-base.a-color-secondary
            type: Text
        variant:
            css: 'a.a-size-mini'
            type: Text
        # A review may carry several images, hence `multiple`.
        images:
            css: img.review-image-tile
            multiple: true
            type: Attribute
            attribute: src
        verified:
            css: 'span[data-hook="avp-badge"]'
            type: Text
        author:
            css: span.a-profile-name
            type: Text
        # The rating is read from the link's `title` attribute, not its text.
        rating:
            css: 'div.a-row:nth-of-type(2) > a.a-link-normal:nth-of-type(1)'
            type: Attribute
            attribute: title
next_page:
    css: 'li.a-last a'
    type: Link

View File

@ -9,20 +9,23 @@ from openai import OpenAI
import os import os
PROXY_HOST = 'localhost' class AmazonScraper:
PROXY_PORT = 1091 def __init__(self):
PROXY_HOST = 'localhost'
PROXY_PORT = 1091
# self.images = []
proxy_dict = { self.proxy_dict = {
'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}', 'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}',
'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}' 'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}'
} }
HEADERS = { HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Accept-Language': 'en-US, en;q=0.5' 'Accept-Language': 'en-US, en;q=0.5'
} }
HEADERS = { self.HEADERS = {
'authority': 'www.amazon.com', 'authority': 'www.amazon.com',
'pragma': 'no-cache', 'pragma': 'no-cache',
'cache-control': 'no-cache', 'cache-control': 'no-cache',
@ -36,13 +39,11 @@ HEADERS = {
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
} }
def get_real_url_from_shortlink(self, short_url, timeout=30):
    """Resolve a short link (e.g. an amzn.to URL) to its final URL.

    Issues a GET through the instance's proxy with the instance's headers
    and returns the URL of the response that was finally received.

    Args:
        short_url: The (possibly shortened) URL to resolve.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled proxy or server blocks the caller forever.

    Returns:
        The URL reported by the response object.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    response = requests.get(
        short_url,
        headers=self.HEADERS,
        proxies=self.proxy_dict,
        timeout=timeout,
    )
    return response.url
def extract_asin(self, product_url):
def extract_asin(product_url):
# Extract the ASIN from the product URL # Extract the ASIN from the product URL
match = re.search(r'/dp/([A-Z0-9]+)', product_url) match = re.search(r'/dp/([A-Z0-9]+)', product_url)
if match: if match:
@ -50,24 +51,25 @@ def extract_asin(product_url):
else: else:
return None return None
def generate_review_url(self, product_url):
    """Build the product-reviews URL for the product behind *product_url*.

    Args:
        product_url: A product page URL from which ``self.extract_asin``
            can pull the ASIN.

    Returns:
        ``https://www.amazon.com/product-reviews/<ASIN>``, or ``None`` when
        no ASIN could be extracted.
    """
    asin = self.extract_asin(product_url)
    if not asin:
        # No ASIN in the URL: there is no review page to point at.
        return None
    return f"https://www.amazon.com/product-reviews/{asin}"
def scrape_amazon_product(product_url): def scrape_amazon_product(self, product_url):
product_url = get_real_url_from_shortlink(product_url) product_url = self.get_real_url_from_shortlink(product_url)
response = requests.get(product_url, headers=HEADERS, proxies=proxy_dict) response = requests.get(product_url, headers=self.HEADERS, proxies=self.proxy_dict)
if response.status_code > 500: if response.status_code > 500:
if "To discuss automated access to Amazon data please contact" in response.text: if "To discuss automated access to Amazon data please contact" in response.text:
print("Page %s was blocked by Amazon. Please try using better proxies\n" % url) print("Page %s was blocked by Amazon. Please try using better proxies\n" % product_url)
else: else:
print("Page %s must have been blocked by Amazon as the status code was %d" % (url, response.status_code)) print(
"Page %s must have been blocked by Amazon as the status code was %d" % (product_url, response.status_code))
return None return None
# #
# soup = BeautifulSoup(response.content, 'html.parser') # soup = BeautifulSoup(response.content, 'html.parser')
@ -80,7 +82,7 @@ def scrape_amazon_product(product_url):
e = Extractor.from_yaml_file('product_selector.yml') e = Extractor.from_yaml_file('product_selector.yml')
product_info = e.extract(response.text) product_info = e.extract(response.text)
# Get link to reviews page # Get link to reviews page
reviews_link = generate_review_url(product_url) reviews_link = self.generate_review_url(product_url)
# Load the Selectorlib YAML file (selectors.yml) # Load the Selectorlib YAML file (selectors.yml)
# You can customize this file to specify which data fields to extract # You can customize this file to specify which data fields to extract
@ -89,26 +91,31 @@ def scrape_amazon_product(product_url):
e = Extractor.from_yaml_file(review_selector_file) e = Extractor.from_yaml_file(review_selector_file)
# Send an HTTP request to the review page # Send an HTTP request to the review page
reviews_response = requests.get(reviews_link, headers=HEADERS, proxies=proxy_dict) reviews_response = requests.get(reviews_link, headers=self.HEADERS, proxies=self.proxy_dict)
# print(reviews_response.text) # print(reviews_response.text)
# Extract review data using the Selectorlib # Extract review data using the Selectorlib
review_data = e.extract(reviews_response.text) review_data = e.extract(reviews_response.text)
print(review_data)
print(product_info)
print(product_info['images'], type(product_info['images']))
self.images = eval(product_info['images']).keys()
print(self.images)
return { return {
# 'Title': product_title,
# 'Rating': product_rating,
# 'Reviews': review_count,
# 'Reviews Link': reviews_link,
'info': product_info, 'info': product_info,
'review texts': review_data # Get the first 3 reviews (you can adjust this as needed) 'review texts': review_data # Get the first 3 reviews (you can adjust this as needed)
} }
def get_product_info_and_reviews(product_url): def get_product_info_and_reviews(self, product_url):
product_info = scrape_amazon_product(url) product_info = self.scrape_amazon_product(product_url)
# print(product_info) # print(product_info)
name = product_info['info']['name'] name = product_info['info']['name']
description = product_info['info']['product_description'] if product_info['info']['product_description'] is not None else product_info['info']['short_description'] description = product_info['info']['product_description'] if product_info['info'][
'product_description'] is not None else \
product_info['info']['short_description']
reviews = "" reviews = ""
for review in product_info['review texts']['reviews']: for review in product_info['review texts']['reviews']:
# print("{}\n{}\n\n".format(review['title'], review['content'])) # print("{}\n{}\n\n".format(review['title'], review['content']))
@ -117,7 +124,11 @@ def get_product_info_and_reviews(product_url):
return f"product name : {name}\ndescription : {description}\n\nreviews : \n{reviews}" return f"product name : {name}\ndescription : {description}\n\nreviews : \n{reviews}"
def ask_ai(prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"): class AIInterface:
def __init__(self):
pass
def ask_ai(self, prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
TOGETHER_API_KEY = "fbd3e65ce35bfa645e9ddc696f51dc705db8eb97a561ed61b52c6435b24bc175" TOGETHER_API_KEY = "fbd3e65ce35bfa645e9ddc696f51dc705db8eb97a561ed61b52c6435b24bc175"
client = OpenAI(api_key=TOGETHER_API_KEY, client = OpenAI(api_key=TOGETHER_API_KEY,
@ -132,7 +143,7 @@ def ask_ai(prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
}, },
{ {
"role": "user", "role": "user",
"content": prompt_for_ai, "content": prompt,
} }
], ],
model=model, model=model,
@ -140,24 +151,26 @@ def ask_ai(prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
) )
return chat_completion.choices[0].message.content return chat_completion.choices[0].message.content
# Define the URL of the Amazon product page # Define the URL of the Amazon product page
# url = "https://www.amazon.com/Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=zg_bsms_c_pet-supplies_d_sccl_3/143-8139391-6089832?pd_rd_w=KLu5Q&content-id=amzn1.sym.309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_p=309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_r=SYS7AW9XS89XM2EMRCFC&pd_rd_wg=wH6LW&pd_rd_r=b778cb5d-ec2b-4d58-9c0c-3799df0689fa&pd_rd_i=B0CVL3RZBX&psc=1" # url = "https://www.amazon.com/Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=zg_bsms_c_pet-supplies_d_sccl_3/143-8139391-6089832?pd_rd_w=KLu5Q&content-id=amzn1.sym.309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_p=309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_r=SYS7AW9XS89XM2EMRCFC&pd_rd_wg=wH6LW&pd_rd_r=b778cb5d-ec2b-4d58-9c0c-3799df0689fa&pd_rd_i=B0CVL3RZBX&psc=1"
#
llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"] # llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
#
url = "https://amzn.to/3wd44FS" # url = "https://amzn.to/3wd44FS"
#
text = get_product_info_and_reviews(url) # scraper = AmazonScraper()
# aii = AIInterface()
prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text #
# text = scraper.get_product_info_and_reviews(url)
#
# prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text
#
#
# ai_response = aii.ask_ai(prompt_for_ai, model=llms[1])
#
# print(prompt_for_ai) # print(prompt_for_ai)
# print("The answer from AI:\n\n")
pyperclip.copy(prompt_for_ai) # print(ai_response)
#
# pyperclip.copy(ai_response)
ai_response = ask_ai(prompt_for_ai, model=llms[1])
print("The answer from AI:\n\n")
print(ai_response)
pyperclip.copy(ai_response)

View File

@ -1,21 +1,37 @@
import gradio as gr import gradio as gr
from scrape_amazon import AmazonScraper, AIInterface
# Model identifiers that can be passed as `model` to AIInterface.ask_ai;
# write_article currently always uses llms[1].
llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
# Module-level instances shared by every write_article call. The scraper
# keeps per-scrape state (scraper.images) that write_article reads back.
scraper = AmazonScraper()
aii = AIInterface()
def write_article(url, ai_prompt):
    """Generate an HTML article about the Amazon product at *url*.

    Two model calls are made: the first drafts the article from the scraped
    product text and reviews, the second rewrites the draft so product names
    link to *url* and, when an image was scraped, embeds that image.

    Args:
        url: Product URL (short links are resolved by the scraper).
        ai_prompt: Instruction entered in the "AI Prompt" box. When it is
            blank the built-in summary instruction is used. Previously this
            value was accepted but never used.

    Returns:
        The HTML string returned by the second model call.
    """
    default_instruction = (
        "Write a summary of the following product and an overview of people's "
        "experiences based on the provided reviews of it as follows. "
        "Format it nicely and professionally in HTML:"
    )
    instruction = (ai_prompt or "").strip() or default_instruction

    # Scrape first: scraper.images is only populated as a side effect of
    # this call.
    text = scraper.get_product_info_and_reviews(url)
    # First scraped image, or None when the page yielded none (the original
    # indexed [0] unconditionally and raised IndexError in that case).
    image_url = next(iter(scraper.images), None)

    draft_html = aii.ask_ai(instruction + "\n\n" + text, model=llms[1])
    print(draft_html)

    image_clause = ""
    if image_url:
        image_clause = (
            f" Also include this image {image_url} after the first or second "
            f"paragraph as a link to {url} and caption it with "
            "<figcaption>Image from Amazon.com</figcaption>."
        )
    linkify_prompt = (
        "Take the following HTML code and slightly modify it by converting "
        f"the names of this product to links to {url}.{image_clause}"
        " Return a nice and professional HTML code:\n" + draft_html
    )
    html_content = aii.ask_ai(linkify_prompt, model=llms[1])
    print(html_content)
    return html_content
# Define the Gradio interface: two inputs (product URL, optional AI prompt)
# feeding write_article, whose returned HTML is rendered directly.
iface = gr.Interface(
    fn=write_article,
    inputs=[
        "text",  # Text input for the URL
        gr.components.Textbox(lines=10, placeholder="Enter AI prompt here...", label="AI Prompt:"),
    ],
    outputs="html",  # Display HTML content
    title="URL to HTML Converter",
    description="Enter a URL to get its HTML content.",
)

# Launch the Gradio app.
# NOTE(review): share=True asks Gradio for a publicly reachable link, so
# anyone holding that link can trigger scraping and model calls - confirm
# this exposure is intended.
iface.launch(server_port=7373, share=True)