diff --git a/product_selector.yml b/product_selector.yml new file mode 100644 index 0000000..61fefe8 --- /dev/null +++ b/product_selector.yml @@ -0,0 +1,41 @@ +name: + css: '#productTitle' + type: Text +price: + css: '#price_inside_buybox' + type: Text +short_description: + css: '#featurebullets_feature_div' + type: Text +images: + css: '.imgTagWrapper img' + type: Attribute + attribute: data-a-dynamic-image +rating: + css: span.arp-rating-out-of-text + type: Text +number_of_reviews: + css: 'a.a-link-normal h2' + type: Text +variants: + css: 'form.a-section li' + multiple: true + type: Text + children: + name: + css: "" + type: Attribute + attribute: title + asin: + css: "" + type: Attribute + attribute: data-defaultasin +product_description: + css: '#productDescription' + type: Text +sales_rank: + css: 'li#SalesRank' + type: Text +link_to_all_reviews: + css: 'div.card-padding a.a-link-emphasis' + type: Link \ No newline at end of file diff --git a/review_selector.yml b/review_selector.yml new file mode 100644 index 0000000..b646a22 --- /dev/null +++ b/review_selector.yml @@ -0,0 +1,38 @@ +product_title: + css: 'h1 a[data-hook="product-link"]' + type: Text +reviews: + css: 'div.review div.a-section.celwidget' + multiple: true + type: Text + children: + title: + css: a.review-title + type: Text + content: + css: 'div.a-row.review-data span.review-text' + type: Text + date: + css: span.a-size-base.a-color-secondary + type: Text + variant: + css: 'a.a-size-mini' + type: Text + images: + css: img.review-image-tile + multiple: true + type: Attribute + attribute: src + verified: + css: 'span[data-hook="avp-badge"]' + type: Text + author: + css: span.a-profile-name + type: Text + rating: + css: 'div.a-row:nth-of-type(2) > a.a-link-normal:nth-of-type(1)' + type: Attribute + attribute: title +next_page: + css: 'li.a-last a' + type: Link \ No newline at end of file diff --git a/scrape_amazon.py b/scrape_amazon.py index d364fa5..64b3839 100644 --- a/scrape_amazon.py +++ b/scrape_amazon.py @@ -9,155 +9,168 @@ from openai import OpenAI import os -PROXY_HOST = 'localhost' -PROXY_PORT = 1091 +class AmazonScraper: + def __init__(self): + PROXY_HOST = 'localhost' + PROXY_PORT = 1091 + # self.images = [] -proxy_dict = { - 'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}', - 'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}' -} + self.proxy_dict = { + 'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}', + 'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}' + } -HEADERS = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', - 'Accept-Language': 'en-US, en;q=0.5' -} + HEADERS = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', + 'Accept-Language': 'en-US, en;q=0.5' + } -HEADERS = { - 'authority': 'www.amazon.com', - 'pragma': 'no-cache', - 'cache-control': 'no-cache', - 'dnt': '1', - 'upgrade-insecure-requests': '1', - 'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36', - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', - 'sec-fetch-site': 'none', - 'sec-fetch-mode': 'navigate', - 'sec-fetch-dest': 'document', - 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', - } + self.HEADERS = { + 'authority': 'www.amazon.com', + 'pragma': 'no-cache', + 'cache-control': 'no-cache', + 'dnt': '1', + 'upgrade-insecure-requests': '1', + 'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36', + 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', + 'sec-fetch-site': 'none', + 'sec-fetch-mode': 'navigate', + 'sec-fetch-dest': 'document', + 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', + } + def get_real_url_from_shortlink(self, short_url): + response = requests.get(short_url, headers=self.HEADERS, proxies=self.proxy_dict) + return response.url -def get_real_url_from_shortlink(short_url): - response = requests.get(short_url, headers=HEADERS, proxies=proxy_dict) - return response.url - - -def extract_asin(product_url): - # Extract the ASIN from the product URL - match = re.search(r'/dp/([A-Z0-9]+)', product_url) - if match: - return match.group(1) - else: - return None - -def generate_review_url(product_url): - base_review_url = "https://www.amazon.com/product-reviews/" - asin = extract_asin(product_url) - if asin: - review_url = f"{base_review_url}{asin}" - return review_url - else: - return None - -def scrape_amazon_product(product_url): - product_url = get_real_url_from_shortlink(product_url) - response = requests.get(product_url, headers=HEADERS, proxies=proxy_dict) - - if response.status_code > 500: - if "To discuss automated access to Amazon data please contact" in response.text: - print("Page %s was blocked by Amazon. Please try using better proxies\n" % url) + def extract_asin(self, product_url): + # Extract the ASIN from the product URL + match = re.search(r'/dp/([A-Z0-9]+)', product_url) + if match: + return match.group(1) else: - print("Page %s must have been blocked by Amazon as the status code was %d" % (url, response.status_code)) - return None - # - # soup = BeautifulSoup(response.content, 'html.parser') - # - # # Extract relevant information - # product_title = soup.find('span', {'id': 'productTitle'}).text.strip() - # product_rating = soup.find('span', {'class': 'a-icon-alt'}).text.strip() - # review_count = soup.find('span', {'id': 'acrCustomerReviewText'}).text.strip() + return None - e = Extractor.from_yaml_file('product_selector.yml') - product_info = e.extract(response.text) - # Get link to reviews page - reviews_link = generate_review_url(product_url) + def generate_review_url(self, product_url): + base_review_url = "https://www.amazon.com/product-reviews/" + asin = self.extract_asin(product_url) + if asin: + review_url = f"{base_review_url}{asin}" + return review_url + else: + return None - # Load the Selectorlib YAML file (selectors.yml) - # You can customize this file to specify which data fields to extract - # For example, review title, review content, rating, etc. - review_selector_file = "review_selector.yml" - e = Extractor.from_yaml_file(review_selector_file) + def scrape_amazon_product(self, product_url): + product_url = self.get_real_url_from_shortlink(product_url) + response = requests.get(product_url, headers=self.HEADERS, proxies=self.proxy_dict) - # Send an HTTP request to the review page - reviews_response = requests.get(reviews_link, headers=HEADERS, proxies=proxy_dict) + if response.status_code > 500: + if "To discuss automated access to Amazon data please contact" in response.text: + print("Page %s was blocked by Amazon. Please try using better proxies\n" % product_url) + else: + print( + "Page %s must have been blocked by Amazon as the status code was %d" % (product_url, response.status_code)) + return None + # + # soup = BeautifulSoup(response.content, 'html.parser') + # + # # Extract relevant information + # product_title = soup.find('span', {'id': 'productTitle'}).text.strip() + # product_rating = soup.find('span', {'class': 'a-icon-alt'}).text.strip() + # review_count = soup.find('span', {'id': 'acrCustomerReviewText'}).text.strip() - # print(reviews_response.text) - # Extract review data using the Selectorlib - review_data = e.extract(reviews_response.text) + e = Extractor.from_yaml_file('product_selector.yml') + product_info = e.extract(response.text) + # Get link to reviews page + reviews_link = self.generate_review_url(product_url) - return { - # 'Title': product_title, - # 'Rating': product_rating, - # 'Reviews': review_count, - # 'Reviews Link': reviews_link, - 'info': product_info, - 'review texts': review_data # Get the first 3 reviews (you can adjust this as needed) - } + # Load the Selectorlib YAML file (selectors.yml) + # You can customize this file to specify which data fields to extract + # For example, review title, review content, rating, etc. + review_selector_file = "review_selector.yml" + e = Extractor.from_yaml_file(review_selector_file) -def get_product_info_and_reviews(product_url): - product_info = scrape_amazon_product(url) - # print(product_info) - name = product_info['info']['name'] - description = product_info['info']['product_description'] if product_info['info']['product_description'] is not None else product_info['info']['short_description'] - reviews = "" - for review in product_info['review texts']['reviews']: - # print("{}\n{}\n\n".format(review['title'], review['content'])) - reviews += "{}\n{}\n\n".format(review['title'], review['content']) + # Send an HTTP request to the review page + reviews_response = requests.get(reviews_link, headers=self.HEADERS, proxies=self.proxy_dict) - return f"product name : {name}\ndescription : {description}\n\nreviews : \n{reviews}" + # print(reviews_response.text) + # Extract review data using the Selectorlib + review_data = e.extract(reviews_response.text) + + print(review_data) + print(product_info) + print(product_info['images'], type(product_info['images'])) + self.images = eval(product_info['images']).keys() + print(self.images) -def ask_ai(prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"): - TOGETHER_API_KEY = "fbd3e65ce35bfa645e9ddc696f51dc705db8eb97a561ed61b52c6435b24bc175" + return { + 'info': product_info, + 'review texts': review_data # Get the first 3 reviews (you can adjust this as needed) + } - client = OpenAI(api_key=TOGETHER_API_KEY, - base_url='https://api.together.xyz', - ) + def get_product_info_and_reviews(self, product_url): + product_info = self.scrape_amazon_product(product_url) + # print(product_info) + name = product_info['info']['name'] + description = product_info['info']['product_description'] if product_info['info'][ + 'product_description'] is not None else \ + product_info['info']['short_description'] + reviews = "" + for review in product_info['review texts']['reviews']: + # print("{}\n{}\n\n".format(review['title'], review['content'])) + reviews += "{}\n{}\n\n".format(review['title'], review['content']) + + return f"product name : {name}\ndescription : {description}\n\nreviews : \n{reviews}" + + +class AIInterface: + def __init__(self): + pass + + def ask_ai(self, prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"): + TOGETHER_API_KEY = "fbd3e65ce35bfa645e9ddc696f51dc705db8eb97a561ed61b52c6435b24bc175" + + client = OpenAI(api_key=TOGETHER_API_KEY, + base_url='https://api.together.xyz', + ) + + chat_completion = client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are an author of a popular product-review weblog", + }, + { + "role": "user", + "content": prompt, + } + ], + model=model, + max_tokens=4096 + ) + return chat_completion.choices[0].message.content - chat_completion = client.chat.completions.create( - messages=[ - { - "role": "system", - "content": "You are an author of a popular product-review weblog", - }, - { - "role": "user", - "content": prompt_for_ai, - } - ], - model=model, - max_tokens=4096 - ) - return chat_completion.choices[0].message.content # Define the URL of the Amazon product page # url = "https://www.amazon.com/Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=zg_bsms_c_pet-supplies_d_sccl_3/143-8139391-6089832?pd_rd_w=KLu5Q&content-id=amzn1.sym.309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_p=309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_r=SYS7AW9XS89XM2EMRCFC&pd_rd_wg=wH6LW&pd_rd_r=b778cb5d-ec2b-4d58-9c0c-3799df0689fa&pd_rd_i=B0CVL3RZBX&psc=1" - -llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"] - -url = "https://amzn.to/3wd44FS" - -text = get_product_info_and_reviews(url) - -prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text - +# +# llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"] +# +# url = "https://amzn.to/3wd44FS" +# +# scraper = AmazonScraper() +# aii = AIInterface() +# +# text = scraper.get_product_info_and_reviews(url) +# +# prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text +# +# +# ai_response = aii.ask_ai(prompt_for_ai, model=llms[1]) +# # print(prompt_for_ai) - -pyperclip.copy(prompt_for_ai) - - -ai_response = ask_ai(prompt_for_ai, model=llms[1]) -print("The answer from AI:\n\n") -print(ai_response) - -pyperclip.copy(ai_response) \ No newline at end of file +# print("The answer from AI:\n\n") +# print(ai_response) +# +# pyperclip.copy(ai_response) diff --git a/webui.py b/webui.py index 8587db8..4b90610 100644 --- a/webui.py +++ b/webui.py @@ -1,21 +1,37 @@ import gradio as gr +from scrape_amazon import AmazonScraper, AIInterface + +llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"] +scraper = AmazonScraper() +aii = AIInterface() -def write_article(url): +def write_article(url, ai_prompt): # Your logic to fetch HTML content from the URL # Replace this with your actual implementation - html_content = f"