import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
from selectorlib import Extractor
import re
import pyperclip
from openai import OpenAI
import os
class AmazonScraper:
    def __init__(self):
        # Route all requests through a local SOCKS5 proxy; 'socks5h' makes the
        # proxy resolve DNS as well (requires requests[socks] / PySocks).
        PROXY_HOST = 'localhost'
        PROXY_PORT = 1091
        # self.images = []
        self.proxy_dict = {
            'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}',
            'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}'
        }
        self.HEADERS = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'
        }
        # A fuller, more browser-like header set; assigning it to self.HEADERS
        # here replaces the minimal headers above for all subsequent requests.
        self.HEADERS = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }
    def get_real_url_from_shortlink(self, short_url):
        # Follow redirects so shortened links (e.g. amzn.to) resolve to the
        # full amazon.com product URL.
        response = requests.get(short_url, headers=self.HEADERS, proxies=self.proxy_dict)
        return response.url
    def extract_asin(self, product_url):
        # Extract the ASIN from the product URL
        match = re.search(r'/dp/([A-Z0-9]+)', product_url)
        if match:
            return match.group(1)
        else:
            return None
    def generate_review_url(self, product_url):
        base_review_url = "https://www.amazon.com/product-reviews/"
        asin = self.extract_asin(product_url)
        if asin:
            review_url = f"{base_review_url}{asin}"
            return review_url
        else:
            return None
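    # Illustrative example of the two helpers above (the ASIN comes from the
    # sample product URL in the comments at the bottom of this file):
    #   extract_asin(".../Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=...")
    #       -> "B0CHZPFZL7"
    #   generate_review_url(same URL)
    #       -> "https://www.amazon.com/product-reviews/B0CHZPFZL7"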
    def scrape_amazon_product(self, product_url):
        product_url = self.get_real_url_from_shortlink(product_url)
        response = requests.get(product_url, headers=self.HEADERS, proxies=self.proxy_dict)
        if response.status_code > 500:
            if "To discuss automated access to Amazon data please contact" in response.text:
                print("Page %s was blocked by Amazon. Please try using better proxies\n" % product_url)
            else:
                print(
                    "Page %s must have been blocked by Amazon as the status code was %d" % (product_url, response.status_code))
            return None
        #
        # soup = BeautifulSoup(response.content, 'html.parser')
        #
        # # Extract relevant information
        # product_title = soup.find('span', {'id': 'productTitle'}).text.strip()
        # product_rating = soup.find('span', {'class': 'a-icon-alt'}).text.strip()
        # review_count = soup.find('span', {'id': 'acrCustomerReviewText'}).text.strip()
        e = Extractor.from_yaml_file('product_selector.yml')
        product_info = e.extract(response.text)
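        # A rough sketch of what product_selector.yml might contain. The field
        # names are taken from how product_info is used later in this file
        # (name, images, product_description, short_description); the CSS
        # selectors and extraction types are assumptions and would need to be
        # checked against the live product page:
        #
        #   name:
        #       css: 'span#productTitle'
        #       type: Text
        #   images:
        #       css: 'img#landingImage'
        #       type: Attribute
        #       attribute: data-a-dynamic-image
        #   product_description:
        #       css: 'div#productDescription'
        #       type: Text
        #   short_description:
        #       css: 'div#featurebullets_feature_div'
        #       type: Text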
        # Get link to reviews page
        reviews_link = self.generate_review_url(product_url)
        # Load the Selectorlib YAML file (review_selector.yml)
        # You can customize this file to specify which data fields to extract
        # For example, review title, review content, rating, etc.
        review_selector_file = "review_selector.yml"
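        # A rough sketch of what review_selector.yml might contain. The reviews
        # list and its title/content children mirror how review_data is consumed
        # in get_product_info_and_reviews below; the CSS selectors are
        # assumptions and may need adjusting to the current review page markup:
        #
        #   reviews:
        #       css: 'div.review'
        #       multiple: true
        #       type: Text
        #       children:
        #           title:
        #               css: 'a.review-title'
        #               type: Text
        #           content:
        #               css: 'span.review-text'
        #               type: Text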
        e = Extractor.from_yaml_file(review_selector_file)
        # Send an HTTP request to the review page
        reviews_response = requests.get(reviews_link, headers=self.HEADERS, proxies=self.proxy_dict)
        # print(reviews_response.text)
        # Extract review data using the Selectorlib
        review_data = e.extract(reviews_response.text)
        print(review_data)
        print(product_info)
        print(product_info['images'], type(product_info['images']))
        # The 'images' field arrives as a string representation of a Python/JSON
        # literal; eval() parses it back into a Python object. ast.literal_eval
        # or json.loads would be a safer choice if the format allows it.
        self.images = eval(product_info['images'])
        print(self.images)
        return {
            'info': product_info,
            'review texts': review_data  # all reviews extracted from the first review page
        }
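    # Illustrative shape of the dictionary returned above (field names as used
    # elsewhere in this file; the actual contents depend on the selector YAML files):
    #   {
    #       'info': {'name': ..., 'images': ..., 'product_description': ...,
    #                'short_description': ..., ...},
    #       'review texts': {'reviews': [{'title': ..., 'content': ..., ...}, ...]}
    #   }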
    def get_product_info_and_reviews(self, product_url):
        product_info = self.scrape_amazon_product(product_url)
        # print(product_info)
        if product_info is None:
            # scrape_amazon_product returns None when the page was blocked.
            return None
        name = product_info['info']['name']
        description = (product_info['info']['product_description']
                       if product_info['info']['product_description'] is not None
                       else product_info['info']['short_description'])
        reviews = ""
        for review in product_info['review texts']['reviews']:
            # print("{}\n{}\n\n".format(review['title'], review['content']))
            reviews += "{}\n{}\n\n".format(review['title'], review['content'])
        return f"product name: {name}\ndescription: {description}\n\nreviews:\n{reviews}"
class AIInterface:
    def __init__(self):
        pass

    def ask_ai(self, prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
        TOGETHER_API_KEY = "fbd3e65ce35bfa645e9ddc696f51dc705db8eb97a561ed61b52c6435b24bc175"
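        # Hypothetical alternative (not in the original code): read the key from
        # the environment instead of hard-coding it, which would also make use of
        # the currently unused `import os` at the top of the file, e.g.:
        #
        #   TOGETHER_API_KEY = os.environ["TOGETHER_API_KEY"]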
        # Together exposes an OpenAI-compatible endpoint, so the OpenAI client is
        # pointed at api.together.xyz instead of OpenAI's own API.
        client = OpenAI(
            api_key=TOGETHER_API_KEY,
            base_url='https://api.together.xyz',
        )
        # client._proxies
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an author of a popular product-review weblog",
                },
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model,
            max_tokens=4096
        )
        return chat_completion.choices[0].message.content
# Define the URL of the Amazon product page
# url = "https://www.amazon.com/Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=zg_bsms_c_pet-supplies_d_sccl_3/143-8139391-6089832?pd_rd_w=KLu5Q&content-id=amzn1.sym.309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_p=309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_r=SYS7AW9XS89XM2EMRCFC&pd_rd_wg=wH6LW&pd_rd_r=b778cb5d-ec2b-4d58-9c0c-3799df0689fa&pd_rd_i=B0CVL3RZBX&psc=1"

if __name__ == "__main__":
    llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]

    url = "https://amzn.to/3wd44FS"

    scraper = AmazonScraper()
    aii = AIInterface()

    text = scraper.get_product_info_and_reviews(url)

    prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text

    ai_response = aii.ask_ai(prompt_for_ai, model=llms[1])

    # print(prompt_for_ai)
    print("The answer from AI:\n\n")
    print(ai_response)

    pyperclip.copy(ai_response)