From ec9a6a3b816e36d4c65b795a1ddbb379af129014 Mon Sep 17 00:00:00 2001
From: shahab00x
Date: Sat, 17 Feb 2024 22:04:06 +0330
Subject: [PATCH] first commit

---
 scrape_amazon.py | 163 +++++++++++++++++++++++++++++++++++++++++++++++
 webui.py         |  21 ++++++
 2 files changed, 184 insertions(+)
 create mode 100644 scrape_amazon.py
 create mode 100644 webui.py

diff --git a/scrape_amazon.py b/scrape_amazon.py
new file mode 100644
index 0000000..d364fa5
--- /dev/null
+++ b/scrape_amazon.py
@@ -0,0 +1,163 @@
+import os
+import re
+
+import pyperclip
+import requests
+from openai import OpenAI
+from selectorlib import Extractor
+
+
+# Route all traffic through a local SOCKS5 proxy (requires requests[socks]).
+PROXY_HOST = 'localhost'
+PROXY_PORT = 1091
+
+proxy_dict = {
+    'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}',
+    'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}'
+}
+
+# Browser-like headers; Amazon blocks requests that look automated.
+HEADERS = {
+    'authority': 'www.amazon.com',
+    'pragma': 'no-cache',
+    'cache-control': 'no-cache',
+    'dnt': '1',
+    'upgrade-insecure-requests': '1',
+    'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
+    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+    'sec-fetch-site': 'none',
+    'sec-fetch-mode': 'navigate',
+    'sec-fetch-dest': 'document',
+    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
+}
+
+
+def get_real_url_from_shortlink(short_url):
+    # Follow redirects (e.g. an amzn.to shortlink) to the canonical product URL.
+    response = requests.get(short_url, headers=HEADERS, proxies=proxy_dict)
+    return response.url
+
+
+def extract_asin(product_url):
+    # Extract the ASIN from the product URL.
+    match = re.search(r'/dp/([A-Z0-9]+)', product_url)
+    if match:
+        return match.group(1)
+    return None
+
+
+def generate_review_url(product_url):
+    base_review_url = "https://www.amazon.com/product-reviews/"
+    asin = extract_asin(product_url)
+    if asin:
+        return f"{base_review_url}{asin}"
+    return None
+
+
+def scrape_amazon_product(product_url):
+    product_url = get_real_url_from_shortlink(product_url)
+    response = requests.get(product_url, headers=HEADERS, proxies=proxy_dict)
+
+    # Amazon typically answers blocked scrapers with a 5xx page.
+    if response.status_code >= 500:
+        if "To discuss automated access to Amazon data please contact" in response.text:
+            print("Page %s was blocked by Amazon. Please try using better proxies\n" % product_url)
+        else:
+            print("Page %s must have been blocked by Amazon as the status code was %d" % (product_url, response.status_code))
+        return None
+
+    # Extract the product fields defined in product_selector.yml.
+    e = Extractor.from_yaml_file('product_selector.yml')
+    product_info = e.extract(response.text)
+
+    # Get the link to the reviews page.
+    reviews_link = generate_review_url(product_url)
+
+    # Load the Selectorlib YAML file (review_selector.yml).
+    # You can customize this file to specify which data fields to extract,
+    # for example review title, review content, rating, etc.
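+    # review_selector.yml itself is not part of this commit; the following is
+    # only a plausible sketch of its shape. The field names (reviews, title,
+    # content) match what the code reads below; the CSS selectors are guesses.
+    #
+    #   reviews:
+    #       css: 'div.review'
+    #       multiple: true
+    #       type: Text
+    #       children:
+    #           title:
+    #               css: 'a.review-title'
+    #               type: Text
+    #           content:
+    #               css: 'div.a-row.review-data span.review-text'
+    #               type: Text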
+    review_selector_file = "review_selector.yml"
+    e = Extractor.from_yaml_file(review_selector_file)
+
+    # Send an HTTP request to the review page.
+    reviews_response = requests.get(reviews_link, headers=HEADERS, proxies=proxy_dict)
+
+    # Extract review data using the Selectorlib extractor.
+    review_data = e.extract(reviews_response.text)
+
+    return {
+        'info': product_info,
+        'review texts': review_data
+    }
+
+
+def get_product_info_and_reviews(product_url):
+    product_info = scrape_amazon_product(product_url)
+    if product_info is None:
+        raise RuntimeError(f"Could not scrape {product_url}")
+
+    name = product_info['info']['name']
+    description = product_info['info']['product_description']
+    if description is None:
+        description = product_info['info']['short_description']
+
+    reviews = ""
+    for review in product_info['review texts']['reviews']:
+        reviews += "{}\n{}\n\n".format(review['title'], review['content'])
+
+    return f"product name: {name}\ndescription: {description}\n\nreviews:\n{reviews}"
+
+
+def ask_ai(prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
+    # Read the Together API key from the environment rather than hardcoding a secret.
+    client = OpenAI(
+        api_key=os.environ["TOGETHER_API_KEY"],
+        base_url='https://api.together.xyz',
+    )
+
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "system",
+                "content": "You are an author of a popular product-review weblog",
+            },
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
+        model=model,
+        max_tokens=4096
+    )
+    return chat_completion.choices[0].message.content
+
+
+llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
+
+
+if __name__ == "__main__":
+    # Define the URL of the Amazon product page (a shortlink is fine;
+    # it is resolved to the full URL before scraping).
+    url = "https://amzn.to/3wd44FS"
+
+    text = get_product_info_and_reviews(url)
+
+    prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews. Format it nicely in markdown:\n\n" + text
+
+    pyperclip.copy(prompt_for_ai)
+
+    ai_response = ask_ai(prompt_for_ai, model=llms[1])
+    print("The answer from AI:\n\n")
+    print(ai_response)
+
+    pyperclip.copy(ai_response)
\ No newline at end of file
diff --git a/webui.py b/webui.py
new file mode 100644
index 0000000..8587db8
--- /dev/null
+++ b/webui.py
@@ -0,0 +1,21 @@
+import gradio as gr
+
+
+def write_article(url):
+    # Your logic to fetch HTML content from the URL goes here;
+    # replace the placeholder below with your actual implementation.
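+    # One possible wiring (an untested sketch, not part of this commit):
+    # reuse the scraping/LLM pipeline from scrape_amazon.py, e.g.
+    #
+    #   from scrape_amazon import get_product_info_and_reviews, ask_ai
+    #   text = get_product_info_and_reviews(url)
+    #   return ask_ai("Summarize this product nicely in markdown:\n\n" + text)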
+    html_content = f"""
+    <div>
+    Sample HTML Content for {url}
+    </div>
+    """
+    return html_content
+
+
+# Define the Gradio interface
+iface = gr.Interface(
+    fn=write_article,
+    inputs="text",    # Text input for the URL
+    outputs="html",   # Display HTML content
+    title="URL to HTML Converter",
+    description="Enter a URL to get its HTML content."
+)
+
+# Launch the Gradio app
+iface.launch(server_port=7373)