ai_article_writer_web_ui/scrape_amazon.py


import ast
import os
import re

import requests
from bs4 import BeautifulSoup  # used only by the commented-out manual parsing below
# from urllib.parse import urljoin  # unused
# import pandas as pd  # unused
from selectorlib import Extractor
import pyperclip  # used only by the commented-out demo at the bottom
from openai import OpenAI
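
# Third-party dependencies (a hypothetical one-liner; requests[socks] pulls in
# PySocks for the socks5h:// proxy scheme used below):
#   pip install requests[socks] beautifulsoup4 selectorlib pyperclip openai
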
class AmazonScraper:
    def __init__(self):
        PROXY_HOST = 'localhost'
        PROXY_PORT = 1091
        self.images = []  # populated by scrape_amazon_product
        # Route requests through a local SOCKS5 proxy (e.g. an SSH tunnel).
        self.proxy_dict = {
            'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}',
            'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}'
        }
        # Proxying is disabled by default; remove this line to use the proxy above.
        self.proxy_dict = {}
        # A simpler alternative header set, currently unused:
        # HEADERS = {
        #     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        #     'Accept-Language': 'en-US, en;q=0.5'
        # }
        self.HEADERS = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }

    def get_real_url_from_shortlink(self, short_url):
        # Follow redirects (e.g. amzn.to short links) to the canonical product URL.
        response = requests.get(short_url, headers=self.HEADERS, proxies=self.proxy_dict)
        return response.url

    def extract_asin(self, product_url):
        # Extract the ASIN from the product URL
        match = re.search(r'/dp/([A-Z0-9]+)', product_url)
        if match:
            return match.group(1)
        else:
            return None
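    # For example, "https://www.amazon.com/Some-Product/dp/B0CHZPFZL7/ref=xyz"
    # (a hypothetical URL) yields "B0CHZPFZL7"; a URL with no /dp/ segment yields None.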

    def generate_review_url(self, product_url):
        # Build the reviews-page URL from the product's ASIN.
        base_review_url = "https://www.amazon.com/product-reviews/"
        asin = self.extract_asin(product_url)
        if asin:
            review_url = f"{base_review_url}{asin}"
            return review_url
        else:
            return None
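    # E.g. a product URL containing /dp/B0CHZPFZL7 maps to
    # "https://www.amazon.com/product-reviews/B0CHZPFZL7".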

    def scrape_amazon_product(self, product_url):
        product_url = self.get_real_url_from_shortlink(product_url)
        response = requests.get(product_url, headers=self.HEADERS, proxies=self.proxy_dict)
        if response.status_code > 500:
            if "To discuss automated access to Amazon data please contact" in response.text:
                print("Page %s was blocked by Amazon. Please try using better proxies\n" % product_url)
            else:
                print("Page %s must have been blocked by Amazon as the status code was %d"
                      % (product_url, response.status_code))
            return None
        # Manual BeautifulSoup parsing, kept for reference:
        # soup = BeautifulSoup(response.content, 'html.parser')
        #
        # # Extract relevant information
        # product_title = soup.find('span', {'id': 'productTitle'}).text.strip()
        # product_rating = soup.find('span', {'class': 'a-icon-alt'}).text.strip()
        # review_count = soup.find('span', {'id': 'acrCustomerReviewText'}).text.strip()
        e = Extractor.from_yaml_file('product_selector.yml')
        product_info = e.extract(response.text)
        # Get the link to the reviews page
        reviews_link = self.generate_review_url(product_url)
        # Load the Selectorlib YAML file (review_selector.yml).
        # You can customize this file to specify which data fields to extract,
        # for example review title, review content, rating, etc.
        review_selector_file = "review_selector.yml"
        e = Extractor.from_yaml_file(review_selector_file)
        # Send an HTTP request to the reviews page
        reviews_response = requests.get(reviews_link, headers=self.HEADERS, proxies=self.proxy_dict)
        # print(reviews_response.text)
        # Extract review data using the Selectorlib extractor
        review_data = e.extract(reviews_response.text)
        print(review_data)
        print(product_info)
        print(product_info['images'], type(product_info['images']))
        # The 'images' field is extracted as a string literal; parse it with
        # ast.literal_eval rather than eval(), which would run arbitrary code.
        self.images = ast.literal_eval(product_info['images'])
        print(self.images)
        return {
            'info': product_info,
            'review texts': review_data
        }
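    # The two YAML files above are not part of this module. A minimal sketch of
    # what they might contain, in selectorlib's format (the CSS selectors and
    # field names below are assumptions; only the keys this module reads --
    # name, product_description, short_description, images, and reviews with
    # title/content children -- are required):
    #
    # product_selector.yml:
    #     name:
    #         css: 'span#productTitle'
    #         type: Text
    #     product_description:
    #         css: 'div#productDescription'
    #         type: Text
    #     short_description:
    #         css: 'div#feature-bullets'
    #         type: Text
    #     images:
    #         css: 'img#landingImage'
    #         type: Attribute
    #         attribute: data-a-dynamic-image
    #
    # review_selector.yml:
    #     reviews:
    #         css: 'div.review'
    #         multiple: true
    #         type: Text
    #         children:
    #             title:
    #                 css: 'a.review-title'
    #                 type: Text
    #             content:
    #                 css: 'div.review-text'
    #                 type: Text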

    def get_product_info_and_reviews(self, product_url):
        product_info = self.scrape_amazon_product(product_url)
        if product_info is None:
            # The request was blocked, so there is nothing to summarise.
            return None
        # print(product_info)
        name = product_info['info']['name']
        # Fall back to the short description when the full description is missing.
        description = product_info['info']['product_description'] \
            if product_info['info']['product_description'] is not None \
            else product_info['info']['short_description']
        reviews = ""
        for review in product_info['review texts']['reviews']:
            # print("{}\n{}\n\n".format(review['title'], review['content']))
            reviews += "{}\n{}\n\n".format(review['title'], review['content'])
        return f"product name : {name}\ndescription : {description}\n\nreviews : \n{reviews}"


class AIInterface:
    def __init__(self):
        pass

    def ask_ai(self, prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
        # Read the key from the environment instead of hard-coding a secret in source.
        TOGETHER_API_KEY = os.environ["TOGETHER_API_KEY"]
        # Together exposes an OpenAI-compatible API, so the standard OpenAI
        # client works with a swapped base_url.
        client = OpenAI(
            api_key=TOGETHER_API_KEY,
            base_url='https://api.together.xyz',
        )
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an author of a popular product-review weblog",
                },
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model,
            max_tokens=4096
        )
        return chat_completion.choices[0].message.content
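
# A minimal usage sketch for AIInterface (assumes TOGETHER_API_KEY is exported
# in the shell, e.g. `export TOGETHER_API_KEY=...`; the prompt is illustrative):
#
# aii = AIInterface()
# print(aii.ask_ai("Write a two-sentence teaser for a product-review blog post."))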
# Define the URL of the Amazon product page
# url = "https://www.amazon.com/Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=zg_bsms_c_pet-supplies_d_sccl_3/143-8139391-6089832?pd_rd_w=KLu5Q&content-id=amzn1.sym.309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_p=309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_r=SYS7AW9XS89XM2EMRCFC&pd_rd_wg=wH6LW&pd_rd_r=b778cb5d-ec2b-4d58-9c0c-3799df0689fa&pd_rd_i=B0CVL3RZBX&psc=1"
#
# llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
#
# url = "https://amzn.to/3wd44FS"
#
# scraper = AmazonScraper()
# aii = AIInterface()
#
# text = scraper.get_product_info_and_reviews(url)
#
# prompt_for_ai = ("write an expanded summary of the following product and an overview of people's experiences, based on the provided reviews. Format it nicely in markdown:\n\n" + text)
#
#
# ai_response = aii.ask_ai(prompt_for_ai, model=llms[1])
#
# print(prompt_for_ai)
# print("The answer from AI:\n\n")
# print(ai_response)
#
# pyperclip.copy(ai_response)