diff --git a/product_selector.yml b/product_selector.yml
new file mode 100644
index 0000000..61fefe8
--- /dev/null
+++ b/product_selector.yml
@@ -0,0 +1,41 @@
+name:
+    css: '#productTitle'
+    type: Text
+price:
+    css: '#price_inside_buybox'
+    type: Text
+short_description:
+    css: '#featurebullets_feature_div'
+    type: Text
+images:
+    css: '.imgTagWrapper img'
+    type: Attribute
+    attribute: data-a-dynamic-image
+rating:
+    css: span.arp-rating-out-of-text
+    type: Text
+number_of_reviews:
+    css: 'a.a-link-normal h2'
+    type: Text
+variants:
+    css: 'form.a-section li'
+    multiple: true
+    type: Text
+    children:
+        name:
+            css: ""
+            type: Attribute
+            attribute: title
+        asin:
+            css: ""
+            type: Attribute
+            attribute: data-defaultasin
+product_description:
+    css: '#productDescription'
+    type: Text
+sales_rank:
+    css: 'li#SalesRank'
+    type: Text
+link_to_all_reviews:
+    css: 'div.card-padding a.a-link-emphasis'
+    type: Link
\ No newline at end of file
diff --git a/review_selector.yml b/review_selector.yml
new file mode 100644
index 0000000..b646a22
--- /dev/null
+++ b/review_selector.yml
@@ -0,0 +1,38 @@
+product_title:
+    css: 'h1 a[data-hook="product-link"]'
+    type: Text
+reviews:
+    css: 'div.review div.a-section.celwidget'
+    multiple: true
+    type: Text
+    children:
+        title:
+            css: a.review-title
+            type: Text
+        content:
+            css: 'div.a-row.review-data span.review-text'
+            type: Text
+        date:
+            css: span.a-size-base.a-color-secondary
+            type: Text
+        variant:
+            css: 'a.a-size-mini'
+            type: Text
+        images:
+            css: img.review-image-tile
+            multiple: true
+            type: Attribute
+            attribute: src
+        verified:
+            css: 'span[data-hook="avp-badge"]'
+            type: Text
+        author:
+            css: span.a-profile-name
+            type: Text
+        rating:
+            css: 'div.a-row:nth-of-type(2) > a.a-link-normal:nth-of-type(1)'
+            type: Attribute
+            attribute: title
+next_page:
+    css: 'li.a-last a'
+    type: Link
\ No newline at end of file
diff --git a/scrape_amazon.py b/scrape_amazon.py
index d364fa5..64b3839 100644
--- a/scrape_amazon.py
+++ b/scrape_amazon.py
@@ -9,155 +9,168 @@ from openai import OpenAI
 import os
 
-PROXY_HOST = 'localhost'
-PROXY_PORT = 1091
+class AmazonScraper:
+    def __init__(self):
+        PROXY_HOST = 'localhost'
+        PROXY_PORT = 1091
+        # self.images = []
 
-proxy_dict = {
-    'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}',
-    'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}'
-}
+        self.proxy_dict = {
+            'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}',
+            'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}'
+        }
 
-HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
-    'Accept-Language': 'en-US, en;q=0.5'
-}
+        HEADERS = {
+            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
+            'Accept-Language': 'en-US, en;q=0.5'
+        }
 
-HEADERS = {
-    'authority': 'www.amazon.com',
-    'pragma': 'no-cache',
-    'cache-control': 'no-cache',
-    'dnt': '1',
-    'upgrade-insecure-requests': '1',
-    'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
-    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
-    'sec-fetch-site': 'none',
-    'sec-fetch-mode': 'navigate',
-    'sec-fetch-dest': 'document',
-    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
-    }
+        self.HEADERS = {
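+            # These headers imitate a normal Chrome session (user-agent, sec-fetch-*,
+            # accept-language); without them Amazon is much more likely to answer
+            # with a robot-check page instead of the product HTML.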
+            'authority': 'www.amazon.com',
+            'pragma': 'no-cache',
+            'cache-control': 'no-cache',
+            'dnt': '1',
+            'upgrade-insecure-requests': '1',
+            'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
+            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+            'sec-fetch-site': 'none',
+            'sec-fetch-mode': 'navigate',
+            'sec-fetch-dest': 'document',
+            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
+        }
 
+    def get_real_url_from_shortlink(self, short_url):
+        response = requests.get(short_url, headers=self.HEADERS, proxies=self.proxy_dict)
+        return response.url
 
-def get_real_url_from_shortlink(short_url):
-    response = requests.get(short_url, headers=HEADERS, proxies=proxy_dict)
-    return response.url
-
-
-def extract_asin(product_url):
-    # Extract the ASIN from the product URL
-    match = re.search(r'/dp/([A-Z0-9]+)', product_url)
-    if match:
-        return match.group(1)
-    else:
-        return None
-
+    def extract_asin(self, product_url):
+        # Extract the ASIN from the product URL
+        match = re.search(r'/dp/([A-Z0-9]+)', product_url)
+        if match:
+            return match.group(1)
+        else:
+            return None
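+    # e.g. extract_asin("https://www.amazon.com/dp/B0CHZPFZL7?psc=1") -> "B0CHZPFZL7".
+    # Short amzn.to links carry no /dp/ segment, which is why they are resolved
+    # through get_real_url_from_shortlink() first.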
 
-def generate_review_url(product_url):
-    base_review_url = "https://www.amazon.com/product-reviews/"
-    asin = extract_asin(product_url)
-    if asin:
-        review_url = f"{base_review_url}{asin}"
-        return review_url
-    else:
-        return None
-
+    def generate_review_url(self, product_url):
+        base_review_url = "https://www.amazon.com/product-reviews/"
+        asin = self.extract_asin(product_url)
+        if asin:
+            review_url = f"{base_review_url}{asin}"
+            return review_url
+        else:
+            return None
 
-def scrape_amazon_product(product_url):
-    product_url = get_real_url_from_shortlink(product_url)
-    response = requests.get(product_url, headers=HEADERS, proxies=proxy_dict)
-
-    if response.status_code > 500:
-        if "To discuss automated access to Amazon data please contact" in response.text:
-            print("Page %s was blocked by Amazon. Please try using better proxies\n" % url)
-        else:
-            print("Page %s must have been blocked by Amazon as the status code was %d" % (url, response.status_code))
-        return None
-    #
-    # soup = BeautifulSoup(response.content, 'html.parser')
-    #
-    # # Extract relevant information
-    # product_title = soup.find('span', {'id': 'productTitle'}).text.strip()
-    # product_rating = soup.find('span', {'class': 'a-icon-alt'}).text.strip()
-    # review_count = soup.find('span', {'id': 'acrCustomerReviewText'}).text.strip()
-
-    e = Extractor.from_yaml_file('product_selector.yml')
-    product_info = e.extract(response.text)
-    # Get link to reviews page
-    reviews_link = generate_review_url(product_url)
-
-    # Load the Selectorlib YAML file (selectors.yml)
-    # You can customize this file to specify which data fields to extract
-    # For example, review title, review content, rating, etc.
-    review_selector_file = "review_selector.yml"
-    e = Extractor.from_yaml_file(review_selector_file)
-
-    # Send an HTTP request to the review page
-    reviews_response = requests.get(reviews_link, headers=HEADERS, proxies=proxy_dict)
-
-    # print(reviews_response.text)
-    # Extract review data using the Selectorlib
-    review_data = e.extract(reviews_response.text)
-
-    return {
-        # 'Title': product_title,
-        # 'Rating': product_rating,
-        # 'Reviews': review_count,
-        # 'Reviews Link': reviews_link,
-        'info': product_info,
-        'review texts': review_data  # Get the first 3 reviews (you can adjust this as needed)
-    }
+    def scrape_amazon_product(self, product_url):
+        product_url = self.get_real_url_from_shortlink(product_url)
+        response = requests.get(product_url, headers=self.HEADERS, proxies=self.proxy_dict)
+
+        if response.status_code > 500:
+            if "To discuss automated access to Amazon data please contact" in response.text:
+                print("Page %s was blocked by Amazon. Please try using better proxies\n" % product_url)
+            else:
+                print(
+                    "Page %s must have been blocked by Amazon as the status code was %d" % (product_url, response.status_code))
+            return None
+        #
+        # soup = BeautifulSoup(response.content, 'html.parser')
+        #
+        # # Extract relevant information
+        # product_title = soup.find('span', {'id': 'productTitle'}).text.strip()
+        # product_rating = soup.find('span', {'class': 'a-icon-alt'}).text.strip()
+        # review_count = soup.find('span', {'id': 'acrCustomerReviewText'}).text.strip()
+
+        e = Extractor.from_yaml_file('product_selector.yml')
+        product_info = e.extract(response.text)
+        # Get link to reviews page
+        reviews_link = self.generate_review_url(product_url)
+
+        # Load the Selectorlib YAML file (selectors.yml)
+        # You can customize this file to specify which data fields to extract
+        # For example, review title, review content, rating, etc.
+        review_selector_file = "review_selector.yml"
+        e = Extractor.from_yaml_file(review_selector_file)
+
+        # Send an HTTP request to the review page
+        reviews_response = requests.get(reviews_link, headers=self.HEADERS, proxies=self.proxy_dict)
+
+        # print(reviews_response.text)
+        # Extract review data using the Selectorlib
+        review_data = e.extract(reviews_response.text)
+
+        print(review_data)
+        print(product_info)
+        print(product_info['images'], type(product_info['images']))
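+        # data-a-dynamic-image holds a JSON object mapping each image URL to its
+        # [width, height], e.g. {"https://m.media-amazon.com/images/I/xyz.jpg": [522, 522], ...}
+        # (hypothetical URL), so the dict keys are the image URLs themselves.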
+        import json  # parse the attribute as JSON rather than eval()-ing page content
+        self.images = json.loads(product_info['images']).keys()
+        print(self.images)
+
+        return {
+            'info': product_info,
+            'review texts': review_data  # Get the first 3 reviews (you can adjust this as needed)
+        }
 
-def get_product_info_and_reviews(product_url):
-    product_info = scrape_amazon_product(url)
-    # print(product_info)
-    name = product_info['info']['name']
-    description = product_info['info']['product_description'] if product_info['info']['product_description'] is not None else product_info['info']['short_description']
-    reviews = ""
-    for review in product_info['review texts']['reviews']:
-        # print("{}\n{}\n\n".format(review['title'], review['content']))
-        reviews += "{}\n{}\n\n".format(review['title'], review['content'])
-
-    return f"product name : {name}\ndescription : {description}\n\nreviews : \n{reviews}"
+    def get_product_info_and_reviews(self, product_url):
+        product_info = self.scrape_amazon_product(product_url)
+        # print(product_info)
+        name = product_info['info']['name']
+        description = product_info['info']['product_description'] if product_info['info'][
+            'product_description'] is not None else \
+            product_info['info']['short_description']
+        reviews = ""
+        for review in product_info['review texts']['reviews']:
+            # print("{}\n{}\n\n".format(review['title'], review['content']))
+            reviews += "{}\n{}\n\n".format(review['title'], review['content'])
+
+        return f"product name : {name}\ndescription : {description}\n\nreviews : \n{reviews}"
 
-def ask_ai(prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
-    TOGETHER_API_KEY = "fbd3e65ce35bfa645e9ddc696f51dc705db8eb97a561ed61b52c6435b24bc175"
-
-    client = OpenAI(api_key=TOGETHER_API_KEY,
-                    base_url='https://api.together.xyz',
-                    )
-
-    chat_completion = client.chat.completions.create(
-        messages=[
-            {
-                "role": "system",
-                "content": "You are an author of a popular product-review weblog",
-            },
-            {
-                "role": "user",
-                "content": prompt_for_ai,
-            }
-        ],
-        model=model,
-        max_tokens=4096
-    )
-    return chat_completion.choices[0].message.content
+
+class AIInterface:
+    def __init__(self):
+        pass
+
+    def ask_ai(self, prompt,
+               model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
+        # Read the Together API key from the environment; the hard-coded string
+        # removed above was a leaked secret and should be rotated.
+        TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
+
+        client = OpenAI(api_key=TOGETHER_API_KEY,
+                        base_url='https://api.together.xyz',
+                        )
+
+        chat_completion = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are an author of a popular product-review weblog",
+                },
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            model=model,
+            max_tokens=4096
+        )
+        return chat_completion.choices[0].message.content
 
 # Define the URL of the Amazon product page
 # url = "https://www.amazon.com/Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=zg_bsms_c_pet-supplies_d_sccl_3/143-8139391-6089832?pd_rd_w=KLu5Q&content-id=amzn1.sym.309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_p=309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_r=SYS7AW9XS89XM2EMRCFC&pd_rd_wg=wH6LW&pd_rd_r=b778cb5d-ec2b-4d58-9c0c-3799df0689fa&pd_rd_i=B0CVL3RZBX&psc=1"
-
-llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
-
-url = "https://amzn.to/3wd44FS"
-
-text = get_product_info_and_reviews(url)
-
-prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text
-
+#
+# llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
+#
+# url = "https://amzn.to/3wd44FS"
+#
+# scraper = AmazonScraper()
+# aii = AIInterface()
+#
+# text = scraper.get_product_info_and_reviews(url)
+#
+# prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text
+#
+#
+# ai_response = aii.ask_ai(prompt_for_ai, model=llms[1])
+#
 # print(prompt_for_ai)
-
-pyperclip.copy(prompt_for_ai)
-
-
-ai_response = ask_ai(prompt_for_ai, model=llms[1])
-print("The answer from AI:\n\n")
-print(ai_response)
-
-pyperclip.copy(ai_response)
\ No newline at end of file
+# print("The answer from AI:\n\n")
+# print(ai_response)
+#
+# pyperclip.copy(ai_response)
diff --git a/webui.py b/webui.py
index 8587db8..4b90610 100644
--- a/webui.py
+++ b/webui.py
@@ -1,21 +1,37 @@
 import gradio as gr
+from scrape_amazon import AmazonScraper, AIInterface
+
+llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]
+scraper = AmazonScraper()
+aii = AIInterface()
 
-def write_article(url):
+def write_article(url, ai_prompt):
     # Your logic to fetch HTML content from the URL
     # Replace this with your actual implementation
-    html_content = f"<html><body><h1>Sample HTML Content for {url}</h1></body></html>"
+
+    text = scraper.get_product_info_and_reviews(url)
+    images = list(scraper.images)[0]
+    prompt_for_ai = "Write a summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely and professionally in HTML:\n\n" + text
+    if ai_prompt:
+        # A prompt typed into the UI replaces the default instruction above.
+        prompt_for_ai = ai_prompt + "\n\n" + text
+    # prompt_for_ai = f"Write a summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely and professionally in HTML. The title of this product should link to {url}. Also include this image {images} after the first or second paragraph as a link to {url} and 'Image from Amazon.com':\n\n" + text
+    ai_response = aii.ask_ai(prompt_for_ai, model=llms[1])
+
+    print(ai_response)
+    html_content = ai_response
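+    # Second pass: the first call writes the article and this call rewrites that
+    # HTML to weave in the affiliate link and product image; two small
+    # single-purpose prompts tend to be followed more reliably than one combined prompt.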
+    prompt_for_ai = f"Take the following HTML code and slightly modify it by converting the names of this product to links to {url}. Also include this image {images} after the first or second paragraph as a link to {url} and caption it with 'Image from Amazon.com'. Return a nice and professional HTML code:\n" + ai_response
+    html_content = aii.ask_ai(prompt_for_ai, model=llms[1])
+    print(html_content)
     return html_content
 
 # Define the Gradio interface
 iface = gr.Interface(
     fn=write_article,
-    inputs="text",  # Text input for the URL
+    inputs=["text", gr.components.Textbox(lines=10, placeholder="Enter AI prompt here...", label="AI Prompt:")],  # URL input plus an optional custom AI prompt
     outputs="html",  # Display HTML content
     title="URL to HTML Converter",
     description="Enter a URL to get its HTML content."
 )
 
 # Launch the Gradio app
-iface.launch(server_port=7373)
+iface.launch(server_port=7373, share=True)
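+# Note: share=True also opens a temporary public *.gradio.live tunnel, so the app
+# is reachable from outside localhost while the process runs.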