import ast
import os
import re

import pyperclip
import requests
from bs4 import BeautifulSoup  # used only by the commented-out parsing fallback below
from openai import OpenAI
from selectorlib import Extractor


class AmazonScraper:
    def __init__(self):
        # SOCKS5 proxy settings. The dict is reset to {} right below, which
        # disables the proxy; keep the populated dict to route traffic through it.
        PROXY_HOST = 'localhost'
        PROXY_PORT = 1091
        self.proxy_dict = {
            'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}',
            'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}'
        }
        self.proxy_dict = {}

        self.images = []

        # Browser-like headers to make the request look less like an automated client.
        self.HEADERS = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }

    def get_real_url_from_shortlink(self, short_url):
        # Follow redirects from a shortened link (e.g. amzn.to) to the full product URL.
        response = requests.get(short_url, headers=self.HEADERS, proxies=self.proxy_dict)
        return response.url

    def extract_asin(self, product_url):
        # Extract the ASIN from the product URL.
        match = re.search(r'/dp/([A-Z0-9]+)', product_url)
        if match:
            return match.group(1)
        return None

    def generate_review_url(self, product_url):
        # Build the canonical reviews-page URL from the product's ASIN.
        base_review_url = "https://www.amazon.com/product-reviews/"
        asin = self.extract_asin(product_url)
        if asin:
            return f"{base_review_url}{asin}"
        return None

    def scrape_amazon_product(self, product_url):
        product_url = self.get_real_url_from_shortlink(product_url)
        response = requests.get(product_url, headers=self.HEADERS, proxies=self.proxy_dict)
        if response.status_code >= 500:
            if "To discuss automated access to Amazon data please contact" in response.text:
                print("Page %s was blocked by Amazon. Please try using better proxies\n" % product_url)
            else:
                print("Page %s must have been blocked by Amazon as the status code was %d"
                      % (product_url, response.status_code))
            return None

        # BeautifulSoup fallback, kept for reference:
        # soup = BeautifulSoup(response.content, 'html.parser')
        # product_title = soup.find('span', {'id': 'productTitle'}).text.strip()
        # product_rating = soup.find('span', {'class': 'a-icon-alt'}).text.strip()
        # review_count = soup.find('span', {'id': 'acrCustomerReviewText'}).text.strip()

        e = Extractor.from_yaml_file('product_selector.yml')
        product_info = e.extract(response.text)

        # Get the link to the reviews page.
        reviews_link = self.generate_review_url(product_url)

        # Load the Selectorlib YAML file (review_selector.yml). Customize it to
        # specify which data fields to extract, e.g. review title, content, rating.
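        # A minimal sketch of what review_selector.yml might contain, shown here
        # as a comment. The CSS selectors are illustrative assumptions, not the
        # actual file; Amazon's markup changes often, so verify them against the
        # live page. product_selector.yml follows the same selectorlib format.
        #
        #   reviews:
        #       css: 'div.review'
        #       multiple: true
        #       type: Text
        #       children:
        #           title:
        #               css: 'a.review-title'
        #               type: Text
        #           content:
        #               css: 'span[data-hook="review-body"]'
        #               type: Text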
        review_selector_file = "review_selector.yml"
        e = Extractor.from_yaml_file(review_selector_file)

        # Send an HTTP request to the reviews page and extract the review fields
        # defined in the YAML file.
        reviews_response = requests.get(reviews_link, headers=self.HEADERS, proxies=self.proxy_dict)
        review_data = e.extract(reviews_response.text)

        # Debug output.
        print(review_data)
        print(product_info)
        print(product_info['images'], type(product_info['images']))

        # The 'images' field arrives as the string representation of a Python
        # literal; ast.literal_eval parses it without the code-execution risk of eval().
        self.images = ast.literal_eval(product_info['images'])
        print(self.images)

        return {
            'info': product_info,
            'review texts': review_data  # all reviews extracted from the first reviews page
        }

    def get_product_info_and_reviews(self, product_url):
        product_info = self.scrape_amazon_product(product_url)
        if product_info is None:  # the page was blocked or could not be scraped
            return None
        name = product_info['info']['name']
        # Fall back to the short description when the long one is missing.
        description = (product_info['info']['product_description']
                       if product_info['info']['product_description'] is not None
                       else product_info['info']['short_description'])
        reviews = ""
        for review in product_info['review texts']['reviews']:
            reviews += "{}\n{}\n\n".format(review['title'], review['content'])
        return f"product name : {name}\ndescription : {description}\n\nreviews : \n{reviews}"


class AIInterface:
    def ask_ai(self, prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
        # Together exposes an OpenAI-compatible endpoint; read the API key from
        # the environment (set TOGETHER_API_KEY) rather than hard-coding it in source.
        client = OpenAI(api_key=os.environ["TOGETHER_API_KEY"],
                        base_url='https://api.together.xyz')
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an author of a popular product-review weblog",
                },
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model,
            max_tokens=4096
        )
        return chat_completion.choices[0].message.content


# Define the URL of the Amazon product page
# url = "https://www.amazon.com/Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=zg_bsms_c_pet-supplies_d_sccl_3/143-8139391-6089832?pd_rd_w=KLu5Q&content-id=amzn1.sym.309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_p=309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_r=SYS7AW9XS89XM2EMRCFC&pd_rd_wg=wH6LW&pd_rd_r=b778cb5d-ec2b-4d58-9c0c-3799df0689fa&pd_rd_i=B0CVL3RZBX&psc=1"
#
# llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
#
# url = "https://amzn.to/3wd44FS"
#
# scraper = AmazonScraper()
# aii = AIInterface()
#
# text = scraper.get_product_info_and_reviews(url)
#
# prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text
#
# ai_response = aii.ask_ai(prompt_for_ai, model=llms[1])
#
# print(prompt_for_ai)
# print("The answer from AI:\n\n")
# print(ai_response)
#
# pyperclip.copy(ai_response)
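

# A runnable version of the commented-out walkthrough above, kept behind a
# __main__ guard as a sketch. It assumes product_selector.yml and
# review_selector.yml sit next to this script and that TOGETHER_API_KEY is set
# in the environment; it also enables the ask_ai() call that the notes above
# left commented out, since print(ai_response) needs it to run.
if __name__ == "__main__":
    llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "togethercomputer/LLaMA-2-7B-32K"]
    url = "https://amzn.to/3wd44FS"

    scraper = AmazonScraper()
    aii = AIInterface()

    text = scraper.get_product_info_and_reviews(url)
    if text is None:
        raise SystemExit("Scraping failed; see the messages above.")

    prompt_for_ai = ("write an expanded summary of the following product and an overview of "
                     "people's experiences based on the provided reviews of it as follows. "
                     "Format it nicely in markdown:\n\n" + text)

    ai_response = aii.ask_ai(prompt_for_ai, model=llms[1])
    print(prompt_for_ai)
    print("The answer from AI:\n\n")
    print(ai_response)

    # Copy the generated write-up to the clipboard for easy pasting.
    pyperclip.copy(ai_response)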