import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
from selectorlib import Extractor
import re
import pyperclip
from openai import OpenAI
import os
class AmazonScraper:
    def __init__(self):
        # Route all requests through a local SOCKS5 proxy; 'socks5h' makes the
        # proxy resolve DNS as well (requires requests[socks] / PySocks).
        PROXY_HOST = 'localhost'
        PROXY_PORT = 1091
        # self.images = []
        self.proxy_dict = {
            'http': f'socks5h://{PROXY_HOST}:{PROXY_PORT}',
            'https': f'socks5h://{PROXY_HOST}:{PROXY_PORT}'
        }
        self.HEADERS = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'
        }
        # A fuller, more browser-like header set; assigning it to self.HEADERS
        # here replaces the minimal headers above for all subsequent requests.
        self.HEADERS = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }
    def get_real_url_from_shortlink(self, short_url):
        # Follow redirects so shortened links (e.g. amzn.to) resolve to the
        # full amazon.com product URL.
        response = requests.get(short_url, headers=self.HEADERS, proxies=self.proxy_dict)
        return response.url
    def extract_asin(self, product_url):
        # Extract the ASIN from the product URL
        match = re.search(r'/dp/([A-Z0-9]+)', product_url)
        if match:
            return match.group(1)
        else:
            return None
    def generate_review_url(self, product_url):
        base_review_url = "https://www.amazon.com/product-reviews/"
        asin = self.extract_asin(product_url)
        if asin:
            review_url = f"{base_review_url}{asin}"
            return review_url
        else:
            return None
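    # Illustrative example of the two helpers above (the ASIN comes from the
    # sample product URL in the comments at the bottom of this file):
    #   extract_asin(".../Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=...")
    #       -> "B0CHZPFZL7"
    #   generate_review_url(same URL)
    #       -> "https://www.amazon.com/product-reviews/B0CHZPFZL7"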
    def scrape_amazon_product(self, product_url):
        product_url = self.get_real_url_from_shortlink(product_url)
        response = requests.get(product_url, headers=self.HEADERS, proxies=self.proxy_dict)
        if response.status_code > 500:
            if "To discuss automated access to Amazon data please contact" in response.text:
                print("Page %s was blocked by Amazon. Please try using better proxies\n" % product_url)
            else:
                print(
                    "Page %s must have been blocked by Amazon as the status code was %d" % (product_url, response.status_code))
            return None
        #
        # soup = BeautifulSoup(response.content, 'html.parser')
        #
        # # Extract relevant information
        # product_title = soup.find('span', {'id': 'productTitle'}).text.strip()
        # product_rating = soup.find('span', {'class': 'a-icon-alt'}).text.strip()
        # review_count = soup.find('span', {'id': 'acrCustomerReviewText'}).text.strip()
        e = Extractor.from_yaml_file('product_selector.yml')
        product_info = e.extract(response.text)
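        # A rough sketch of what product_selector.yml might contain. The field
        # names are taken from how product_info is used later in this file
        # (name, images, product_description, short_description); the CSS
        # selectors and extraction types are assumptions and would need to be
        # checked against the live product page:
        #
        #   name:
        #       css: 'span#productTitle'
        #       type: Text
        #   images:
        #       css: 'img#landingImage'
        #       type: Attribute
        #       attribute: data-a-dynamic-image
        #   product_description:
        #       css: 'div#productDescription'
        #       type: Text
        #   short_description:
        #       css: 'div#featurebullets_feature_div'
        #       type: Text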
        # Get link to reviews page
        reviews_link = self.generate_review_url(product_url)
        # Load the Selectorlib YAML file (review_selector.yml)
        # You can customize this file to specify which data fields to extract
        # For example, review title, review content, rating, etc.
        review_selector_file = "review_selector.yml"
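        # A rough sketch of what review_selector.yml might contain. The reviews
        # list and its title/content children mirror how review_data is consumed
        # in get_product_info_and_reviews below; the CSS selectors are
        # assumptions and may need adjusting to the current review page markup:
        #
        #   reviews:
        #       css: 'div.review'
        #       multiple: true
        #       type: Text
        #       children:
        #           title:
        #               css: 'a.review-title'
        #               type: Text
        #           content:
        #               css: 'span.review-text'
        #               type: Text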
        e = Extractor.from_yaml_file(review_selector_file)
        # Send an HTTP request to the review page
        reviews_response = requests.get(reviews_link, headers=self.HEADERS, proxies=self.proxy_dict)
        # print(reviews_response.text)
        # Extract review data using the Selectorlib
        review_data = e.extract(reviews_response.text)
        print(review_data)
        print(product_info)
        print(product_info['images'], type(product_info['images']))
        # The 'images' field arrives as a string representation of a Python/JSON
        # literal; eval() parses it back into a Python object. ast.literal_eval
        # or json.loads would be a safer choice if the format allows it.
        self.images = eval(product_info['images'])
        print(self.images)
        return {
            'info': product_info,
            'review texts': review_data  # all reviews extracted from the first review page
        }
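    # Illustrative shape of the dictionary returned above (field names as used
    # elsewhere in this file; the actual contents depend on the selector YAML files):
    #   {
    #       'info': {'name': ..., 'images': ..., 'product_description': ...,
    #                'short_description': ..., ...},
    #       'review texts': {'reviews': [{'title': ..., 'content': ..., ...}, ...]}
    #   }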
    def get_product_info_and_reviews(self, product_url):
        product_info = self.scrape_amazon_product(product_url)
        # print(product_info)
        if product_info is None:
            # scrape_amazon_product returns None when the page was blocked.
            return None
        name = product_info['info']['name']
        description = (product_info['info']['product_description']
                       if product_info['info']['product_description'] is not None
                       else product_info['info']['short_description'])
        reviews = ""
        for review in product_info['review texts']['reviews']:
            # print("{}\n{}\n\n".format(review['title'], review['content']))
            reviews += "{}\n{}\n\n".format(review['title'], review['content'])
        return f"product name: {name}\ndescription: {description}\n\nreviews:\n{reviews}"
class AIInterface:
    def __init__(self):
        pass

    def ask_ai(self, prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
        TOGETHER_API_KEY = "fbd3e65ce35bfa645e9ddc696f51dc705db8eb97a561ed61b52c6435b24bc175"
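        # Hypothetical alternative (not in the original code): read the key from
        # the environment instead of hard-coding it, which would also make use of
        # the currently unused `import os` at the top of the file, e.g.:
        #
        #   TOGETHER_API_KEY = os.environ["TOGETHER_API_KEY"]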
        # Together exposes an OpenAI-compatible endpoint, so the OpenAI client is
        # pointed at api.together.xyz instead of OpenAI's own API.
        client = OpenAI(
            api_key=TOGETHER_API_KEY,
            base_url='https://api.together.xyz',
        )
        # client._proxies
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an author of a popular product-review weblog",
                },
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model,
            max_tokens=4096
        )
        return chat_completion.choices[0].message.content
# Define the URL of the Amazon product page
# url = "https://www.amazon.com/Bark-Spark-Poo-Treats-Coprophagia/dp/B0CHZPFZL7/ref=zg_bsms_c_pet-supplies_d_sccl_3/143-8139391-6089832?pd_rd_w=KLu5Q&content-id=amzn1.sym.309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_p=309d45c5-3eba-4f62-9bb2-0acdcf0662e7&pf_rd_r=SYS7AW9XS89XM2EMRCFC&pd_rd_wg=wH6LW&pd_rd_r=b778cb5d-ec2b-4d58-9c0c-3799df0689fa&pd_rd_i=B0CVL3RZBX&psc=1"

if __name__ == "__main__":
    llms = ['meta-llama/Llama-2-70b-chat-hf', "mistralai/Mixtral-8x7B-Instruct-v0.1", "togethercomputer/LLaMA-2-7B-32K"]

    url = "https://amzn.to/3wd44FS"

    scraper = AmazonScraper()
    aii = AIInterface()

    text = scraper.get_product_info_and_reviews(url)

    prompt_for_ai = "write an expanded summary of the following product and an overview of people's experiences based on the provided reviews of it as follows. Format it nicely in markdown:\n\n" + text

    ai_response = aii.ask_ai(prompt_for_ai, model=llms[1])

    # print(prompt_for_ai)
    print("The answer from AI:\n\n")
    print(ai_response)

    pyperclip.copy(ai_response)