first commit

parent ec9a6a3b81
commit edd3fcaca5

product_selector.yml (new file, 41 lines)
@@ -0,0 +1,41 @@
name:
    css: '#productTitle'
    type: Text
price:
    css: '#price_inside_buybox'
    type: Text
short_description:
    css: '#featurebullets_feature_div'
    type: Text
images:
    css: '.imgTagWrapper img'
    type: Attribute
    attribute: data-a-dynamic-image
rating:
    css: span.arp-rating-out-of-text
    type: Text
number_of_reviews:
    css: 'a.a-link-normal h2'
    type: Text
variants:
    css: 'form.a-section li'
    multiple: true
    type: Text
    children:
        name:
            css: ""
            type: Attribute
            attribute: title
        asin:
            css: ""
            type: Attribute
            attribute: data-defaultasin
product_description:
    css: '#productDescription'
    type: Text
sales_rank:
    css: 'li#SalesRank'
    type: Text
link_to_all_reviews:
    css: 'div.card-padding a.a-link-emphasis'
    type: Link
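Note: product_selector.yml is a selectorlib selector definition; scrape_amazon.py loads it with Extractor.from_yaml_file and applies it to the fetched page. A minimal sketch of that usage, not part of the commit (the saved-page path is an illustrative assumption):

from selectorlib import Extractor

extractor = Extractor.from_yaml_file('product_selector.yml')

# Assumed input: a locally saved Amazon product page, purely for illustration.
with open('sample_product_page.html', encoding='utf-8') as f:
    html = f.read()

data = extractor.extract(html)        # dict keyed by the field names defined above
print(data['name'], data['price'])    # e.g. product title and buybox price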
review_selector.yml (new file, 38 lines)
@@ -0,0 +1,38 @@
product_title:
    css: 'h1 a[data-hook="product-link"]'
    type: Text
reviews:
    css: 'div.review div.a-section.celwidget'
    multiple: true
    type: Text
    children:
        title:
            css: a.review-title
            type: Text
        content:
            css: 'div.a-row.review-data span.review-text'
            type: Text
        date:
            css: span.a-size-base.a-color-secondary
            type: Text
        variant:
            css: 'a.a-size-mini'
            type: Text
        images:
            css: img.review-image-tile
            multiple: true
            type: Attribute
            attribute: src
        verified:
            css: 'span[data-hook="avp-badge"]'
            type: Text
        author:
            css: span.a-profile-name
            type: Text
        rating:
            css: 'div.a-row:nth-of-type(2) > a.a-link-normal:nth-of-type(1)'
            type: Attribute
            attribute: title
next_page:
    css: 'li.a-last a'
    type: Link
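Note: the next_page selector (a Link) is defined here but not used by scrape_amazon.py in this commit. A hedged sketch of how it could drive review pagination, assuming the same selectorlib Extractor and a plain requests call (the start URL, page cap, and the omission of headers/proxies are illustrative simplifications, not the commit's behaviour):

from urllib.parse import urljoin

import requests
from selectorlib import Extractor

extractor = Extractor.from_yaml_file('review_selector.yml')

url = "https://www.amazon.com/product-reviews/B0CHZPFZL7"  # ASIN taken from the commented example URL in scrape_amazon.py
all_reviews = []
for _ in range(3):                      # arbitrary page cap for the sketch
    page = requests.get(url)
    data = extractor.extract(page.text)
    all_reviews.extend(data.get('reviews') or [])
    next_href = data.get('next_page')
    if not next_href:
        break
    url = urljoin(url, next_href)       # the extracted href may be relative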
scrape_amazon.py (275 changed lines)
@@ -9,155 +9,168 @@ from openai import OpenAI
 import os
 
 
-PROXY_HOST = 'localhost'
-PROXY_PORT = 1091
-
-proxy_dict = {
-    'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}',
-    'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}'
-}
-
-HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
-    'Accept-Language': 'en-US, en;q=0.5'
-}
-
-HEADERS = {
-    'authority': 'www.amazon.com',
-    'pragma': 'no-cache',
-    'cache-control': 'no-cache',
-    'dnt': '1',
-    'upgrade-insecure-requests': '1',
-    'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
-    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
-    'sec-fetch-site': 'none',
-    'sec-fetch-mode': 'navigate',
-    'sec-fetch-dest': 'document',
-    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
-}
+class AmazonScraper:
+    def __init__(self):
+        PROXY_HOST = 'localhost'
+        PROXY_PORT = 1091
+        # self.images = []
+
+        self.proxy_dict = {
+            'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}',
+            'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}'
+        }
+
+        HEADERS = {
+            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
+            'Accept-Language': 'en-US, en;q=0.5'
+        }
+
+        self.HEADERS = {
+            'authority': 'www.amazon.com',
+            'pragma': 'no-cache',
+            'cache-control': 'no-cache',
+            'dnt': '1',
+            'upgrade-insecure-requests': '1',
+            'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
+            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+            'sec-fetch-site': 'none',
+            'sec-fetch-mode': 'navigate',
+            'sec-fetch-dest': 'document',
+            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
+        }
 
-
-def get_real_url_from_shortlink(short_url):
-    response = requests.get(short_url, headers=HEADERS, proxies=proxy_dict)
-    return response.url
+    def get_real_url_from_shortlink(self, short_url):
+        response = requests.get(short_url, headers=self.HEADERS, proxies=self.proxy_dict)
+        return response.url
 
-
-def extract_asin(product_url):
-    # Extract the ASIN from the product URL
-    match = re.search(r'/dp/([A-Z0-9]+)', product_url)
-    if match:
-        return match.group(1)
-    else:
-        return None
+    def extract_asin(self, product_url):
+        # Extract the ASIN from the product URL
+        match = re.search(r'/dp/([A-Z0-9]+)', product_url)
+        if match:
+            return match.group(1)
+        else:
+            return None
 
-
-def generate_review_url(product_url):
-    base_review_url = "https://www.amazon.com/product-reviews/"
-    asin = extract_asin(product_url)
-    if asin:
-        review_url = f"{base_review_url}{asin}"
-        return review_url
-    else:
-        return None
+    def generate_review_url(self, product_url):
+        base_review_url = "https://www.amazon.com/product-reviews/"
+        asin = self.extract_asin(product_url)
+        if asin:
+            review_url = f"{base_review_url}{asin}"
+            return review_url
+        else:
+            return None
 
-
-def scrape_amazon_product(product_url):
-    product_url = get_real_url_from_shortlink(product_url)
-    response = requests.get(product_url, headers=HEADERS, proxies=proxy_dict)
-
-    if response.status_code > 500:
-        if "To discuss automated access to Amazon data please contact" in response.text:
-            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
-        else:
-            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, response.status_code))
-        return None
-    #
-    # soup = BeautifulSoup(response.content, 'html.parser')
-    #
-    # # Extract relevant information
-    # product_title = soup.find('span', {'id': 'productTitle'}).text.strip()
-    # product_rating = soup.find('span', {'class': 'a-icon-alt'}).text.strip()
-    # review_count = soup.find('span', {'id': 'acrCustomerReviewText'}).text.strip()
-
-    e = Extractor.from_yaml_file('product_selector.yml')
-    product_info = e.extract(response.text)
-    # Get link to reviews page
-    reviews_link = generate_review_url(product_url)
-
-    # Load the Selectorlib YAML file (selectors.yml)
-    # You can customize this file to specify which data fields to extract
-    # For example, review title, review content, rating, etc.
-    review_selector_file = "review_selector.yml"
-    e = Extractor.from_yaml_file(review_selector_file)
-
-    # Send an HTTP request to the review page
-    reviews_response = requests.get(reviews_link, headers=HEADERS, proxies=proxy_dict)
-
-    # print(reviews_response.text)
-    # Extract review data using the Selectorlib
-    review_data = e.extract(reviews_response.text)
-
-    return {
-        # 'Title': product_title,
-        # 'Rating': product_rating,
-        # 'Reviews': review_count,
-        # 'Reviews Link': reviews_link,
-        'info': product_info,
-        'review texts': review_data  # Get the first 3 reviews (you can adjust this as needed)
-    }
+    def scrape_amazon_product(self, product_url):
+        product_url = self.get_real_url_from_shortlink(product_url)
+        response = requests.get(product_url, headers=self.HEADERS, proxies=self.proxy_dict)
+
+        if response.status_code > 500:
+            if "To discuss automated access to Amazon data please contact" in response.text:
+                print("Page %s was blocked by Amazon. Please try using better proxies\n" % product_url)
+            else:
+                print(
+                    "Page %s must have been blocked by Amazon as the status code was %d" % (product_url, response.status_code))
+            return None
+        #
+        # soup = BeautifulSoup(response.content, 'html.parser')
+        #
+        # # Extract relevant information
+        # product_title = soup.find('span', {'id': 'productTitle'}).text.strip()
+        # product_rating = soup.find('span', {'class': 'a-icon-alt'}).text.strip()
+        # review_count = soup.find('span', {'id': 'acrCustomerReviewText'}).text.strip()
+
+        e = Extractor.from_yaml_file('product_selector.yml')
+        product_info = e.extract(response.text)
+        # Get link to reviews page
+        reviews_link = self.generate_review_url(product_url)
+
+        # Load the Selectorlib YAML file (selectors.yml)
+        # You can customize this file to specify which data fields to extract
+        # For example, review title, review content, rating, etc.
+        review_selector_file = "review_selector.yml"
+        e = Extractor.from_yaml_file(review_selector_file)
+
+        # Send an HTTP request to the review page
+        reviews_response = requests.get(reviews_link, headers=self.HEADERS, proxies=self.proxy_dict)
+
+        # print(reviews_response.text)
+        # Extract review data using the Selectorlib
+        review_data = e.extract(reviews_response.text)
+
+        print(review_data)
+        print(product_info)
+        print(product_info['images'], type(product_info['images']))
+        self.images = eval(product_info['images']).keys()
+        print(self.images)
+
+        return {
+            'info': product_info,
+            'review texts': review_data  # Get the first 3 reviews (you can adjust this as needed)
+        }
 
-
-def get_product_info_and_reviews(product_url):
-    product_info = scrape_amazon_product(url)
-    # print(product_info)
-    name = product_info['info']['name']
-    description = product_info['info']['product_description'] if product_info['info']['product_description'] is not None else product_info['info']['short_description']
-    reviews = ""
-    for review in product_info['review texts']['reviews']:
-        # print("{}\n{}\n\n".format(review['title'], review['content']))
-        reviews += "{}\n{}\n\n".format(review['title'], review['content'])
-
-    return f"product name : {name}\ndescription : {description}\n\nreviews : \n{reviews}"
+    def get_product_info_and_reviews(self, product_url):
+        product_info = self.scrape_amazon_product(product_url)
+        # print(product_info)
+        name = product_info['info']['name']
+        description = product_info['info']['product_description'] if product_info['info'][
+            'product_description'] is not None else \
+            product_info['info']['short_description']
+        reviews = ""
+        for review in product_info['review texts']['reviews']:
+            # print("{}\n{}\n\n".format(review['title'], review['content']))
+            reviews += "{}\n{}\n\n".format(review['title'], review['content'])
+
+        return f"product name : {name}\ndescription : {description}\n\nreviews : \n{reviews}"
 
 
-def ask_ai(prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
-    TOGETHER_API_KEY = "fbd3e65ce35bfa645e9ddc696f51dc705db8eb97a561ed61b52c6435b24bc175"
-
-    client = OpenAI(api_key=TOGETHER_API_KEY,
-                    base_url='https://api.together.xyz',
-                    )
-
-    chat_completion = client.chat.completions.create(
-        messages=[
-            {
-                "role": "system",
-                "content": "You are an author of a popular product-review weblog",
-            },
-            {
-                "role": "user",
-                "content": prompt_for_ai,
-            }
-        ],
-        model=model,
-        max_tokens=4096
-    )
-    return chat_completion.choices[0].message.content
+class AIInterface:
+    def __init__(self):
+        pass
+
+    def ask_ai(self, prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
+        TOGETHER_API_KEY = "fbd3e65ce35bfa645e9ddc696f51dc705db8eb97a561ed61b52c6435b24bc175"
+
+        client = OpenAI(api_key=TOGETHER_API_KEY,
+                        base_url='https://api.together.xyz',
+                        )
+
+        chat_completion = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are an author of a popular product-review weblog",
+                },
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            model=model,
+            max_tokens=4096
+        )
+        return chat_completion.choices[0].message.content
 
 
 # Define the URL of the Amazon product page
 # url = "https://www.amazon.com/Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=zg_bsms_c_pet-supplies_d_sccl_3/143-8139391-6089832?pd_rd_w=KLu5Q&content-id=amzn1.sym.309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_p=309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_r=SYS7AW9XS89XM2EMRCFC&pd_rd_wg=wH6LW&pd_rd_r=b778cb5d-ec2b-4d58-9c0c-3799df0689fa&pd_rd_i=B0CVL3RZBX&psc=1"
-llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
-url = "https://amzn.to/3wd44FS"
-text = get_product_info_and_reviews(url)
-prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text
+#
+# llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
+#
+# url = "https://amzn.to/3wd44FS"
+#
+# scraper = AmazonScraper()
+# aii = AIInterface()
+#
+# text = scraper.get_product_info_and_reviews(url)
+#
+# prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text
+#
+#
+# ai_response = aii.ask_ai(prompt_for_ai, model=llms[1])
+#
 # print(prompt_for_ai)
-pyperclip.copy(prompt_for_ai)
-ai_response = ask_ai(prompt_for_ai, model=llms[1])
-print("The answer from AI:\n\n")
-print(ai_response)
-
-pyperclip.copy(ai_response)
+# print("The answer from AI:\n\n")
+# print(ai_response)
+#
+# pyperclip.copy(ai_response)
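Note: the commit leaves the old script-style driver commented out at the bottom of the new scrape_amazon.py. A minimal sketch of the equivalent class-based flow, mirroring those comments (the short URL and prompt text are the ones already present in the file):

from scrape_amazon import AmazonScraper, AIInterface

llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]

scraper = AmazonScraper()
aii = AIInterface()

text = scraper.get_product_info_and_reviews("https://amzn.to/3wd44FS")
prompt_for_ai = ("write an expanded summary of the following product and an overview of "
                 "people's experiences based on the provided reviews of it as follows. "
                 "Format it nicely in markdown:\n\n" + text)
print(aii.ask_ai(prompt_for_ai, model=llms[1]))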
webui.py (24 changed lines)
@@ -1,21 +1,37 @@
 import gradio as gr
+from scrape_amazon import AmazonScraper, AIInterface
+
+llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
+scraper = AmazonScraper()
+aii = AIInterface()
 
 
-def write_article(url):
+def write_article(url, ai_prompt):
     # Your logic to fetch HTML content from the URL
     # Replace this with your actual implementation
-    html_content = f"<h1>Sample HTML Content for {url}</h1>"
+    text = scraper.get_product_info_and_reviews(url)
+    images = list(scraper.images)[0]
+    prompt_for_ai = "Write a summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely and professionally in HTML:\n\n" + text
+    # prompt_for_ai = f"Write a summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely and professionally in HTML. The title of this product should links to {url}. Also include this image {images} after the first or second paragraph as a link to {url} and <figcaption>Image from Amazon.com</figcaption>:\n\n" + text
+    ai_response = aii.ask_ai(prompt_for_ai, model=llms[1])
+
+    print(ai_response)
+    html_content = ai_response
+    prompt_for_ai = f"Take the following HTML code and slightly modify it by converting the names of this product to links to {url}. Also include this image {images} after the first or second paragraph as a link to {url} and caption it with <figcaption>Image from Amazon.com</figcaption>. Return a nice and professional HTML code:\n" + ai_response
+    html_content = aii.ask_ai(prompt_for_ai, model=llms[1])
+    print(html_content)
     return html_content
 
 
 # Define the Gradio interface
 iface = gr.Interface(
     fn=write_article,
-    inputs="text",  # Text input for the URL
+    inputs=["text", gr.components.Textbox(lines=10, placeholder="Enter AI prompt here...", label="AI Prompt:")],  # Text input for the URL
     outputs="html",  # Display HTML content
     title="URL to HTML Converter",
     description="Enter a URL to get its HTML content."
 )
 
 # Launch the Gradio app
-iface.launch(server_port=7373)
+iface.launch(server_port=7373, share=True)