
How to Scrape E-commerce Platform Data: Crawling the Best-Selling Top Women's Clothing Category (ATTENTION)


Goal

Fetch data from a large overseas e-commerce site (link).

This time the goal is to crawl data for the best-selling (top) women's clothing category, as follows:

The fields include the SKU detail-page title, price, main image, and so on,

as well as all reviews (review date, star rating, purchased variant details such as color & size, and the review text).
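For context on how these fields end up in the sheet: the script writes one row per SKU, and each review goes in as its own row padded with five empty cells so that the review columns sit to the right of the SKU columns. A minimal illustration of that row layout (placeholder values only, not real scraped data):

```python
# Row layout the script below writes to the Google Sheet.
# Values here are placeholders, not real scraped data.
sku_row = ['<product title>', '<price>', '=IMAGE("<main image url>")', '<category>', '<detail page url>']

# Review rows leave the first five cells empty so the review columns
# line up to the right of the SKU columns in the same worksheet.
comment_row = ['', '', '', '', '',
               '<review date>', '<star rating>', '<color & size>', '<review text>']

print(sku_row)
print(comment_row)
```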

The full code is below:

```python
# -*- coding: utf-8 -*-
import math
import re
import time
import requests
from loguru import logger
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from utils.request import Request
from utils.common import sleep2, open_gsheet, gsheet_append_rows, gsheet_append_row


class Amazon:
    def __init__(self):
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cookie': 'aws-target-data=%7B%22support%22%3A%221%22%7D; aws-target-visitor-id=1658219333848-161299.32_0; regStatus=pre-register; AMCV_7742037254C95E840A4C98A6%40AdobeOrg=1585540135%7CMCIDTS%7C19193%7CMCMID%7C83739249770090151470991008238048993563%7CMCAAMLH-1658824133%7C3%7CMCAAMB-1658824133%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1658226534s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.4.0; session-id=139-5736486-7904207; i18n-prefs=USD; lc-main=zh_CN; ubid-main=131-1149456-9803809; sp-cdn="L5Z9:HK"; session-id-time=2082787201l; session-token="dW/9hnHEq1nmBVnrkKUH15nDtZkBICOXbe1BE8OEWhllybEQ+H31J6NXaSiBTzzNmOv60c8EBGA4GPla0ng0Q3RzWm6akOwxNOWeN0iFCMxh/uJ3qua0+x+5FFoIZch7r4f864Kvpaa1oOIqmGvimXVeedFsUAUTqCltiV9F3G6DGCHFRPWCYCTlxmU91tqg4NDWtLK5Z+osF5rGxyj5Oi/wzGa+1vaFx/ES6bUFZyQ="; csm-hit=tb:s-6PGAB78TGWTJMYP4B7AG|1668503449994&t:1668503450108&adb:adblk_no',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
        }
        self.sheet = open_gsheet('US COMMENTS', 'amazon')
        self.rq = Request()

    def request_retry(self, url):
        """GET with up to 10 retries on robot checks, 404s and network errors."""
        i = 0
        while i < 10:
            try:
                # r = requests.get(url, headers=headers, timeout=20)
                r = self.rq.requests_get(url, headers=self.headers)
                if "Sorry, we just need to make sure you're not a robot" in r.text:
                    print("Sorry, we just need to make sure you're not a robot")
                    i += 1
                    sleep2(10)
                    print(f'try {i} times')
                    continue
                if r.status_code in [404]:
                    i += 1
                    sleep2(10)
                    print(f'try {i} times, 404')
                    continue
                break
            except Exception as e:
                # network / site-side error, back off and retry
                i += 1
                sleep2(10)
                print(f'try {i} times, {e.args}')
        else:
            raise Exception(f'request failed after {i} retries: {url}')
        return r

    def treat_comment_page(self, id, page):
        """Scrape one page of reviews and append them to the sheet."""
        logger.info(f'page: {page}')
        url = f'https://www.amazon.com/product-reviews/{id}/ref=cm_cr_getr_d_paging_btm_next_{page}?ie=UTF8&reviewerType=all_reviews&pageNumber={page}'
        r = self.request_retry(url)
        # r = self.rq.requests_get(url, headers=self.headers)
        # print(r.status_code, r.text[:1000])
        soup = BeautifulSoup(r.text, 'lxml')
        comments_tag = soup.select('.a-section.review')
        comments = []
        for comment_tag in comments_tag:
            comment_time = comment_tag.select('.a-row .review-date')[0].text.strip()[3:].strip()  # .split(' ')[0]
            try:
                comment_style = comment_tag.select('.a-size-mini.a-link-normal.a-color-secondary')[0].text.strip()
            except Exception:
                comment_style = ''
            comment_content = comment_tag.select('[data-hook="review-body"]')[0].text.replace('\n', '').strip()
            comment_star = comment_tag.select('[data-hook="review-star-rating"] .a-icon-alt')[0].text.strip().split(',')[0][:-2].strip()
            print(comment_time, comment_star, comment_style, comment_content)
            comment_data = ['', '', '', '', '', comment_time, comment_star, comment_style, comment_content]
            comments.append(comment_data)
        gsheet_append_rows(self.sheet, comments)

    def parse_exception(self, obj):
        # done-callback for the thread pool: surface any result / exception from the future
        res = obj.result()
        if res:
            print(res)

    def more_comment(self, id):
        """Scrape the remaining review pages, starting from page 2."""
        # read the total review count from page 2 first
        logger.info('page: 2')
        url = f'https://www.amazon.com/product-reviews/{id}/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2'
        r = self.request_retry(url)
        soup = BeautifulSoup(r.text, 'lxml')
        comments_tag = soup.select('.a-section.review')
        comments = []
        for comment_tag in comments_tag:
            comment_time = comment_tag.select('.a-row .review-date')[0].text.strip()[3:].strip()  # .split(' ')[0]
            try:
                comment_style = comment_tag.select('.a-size-mini.a-link-normal.a-color-secondary')[0].text.strip()
            except Exception:
                comment_style = ''
            comment_content = comment_tag.select('[data-hook="review-body"]')[0].text.replace('\n', '').strip()
            comment_star = comment_tag.select('[data-hook="review-star-rating"] .a-icon-alt')[0].text.strip().split(',')[0][:-2].strip()
            print(comment_time, comment_star, comment_style, comment_content)
            comment_data = ['', '', '', '', '', comment_time, comment_star, comment_style, comment_content]
            comments.append(comment_data)
        gsheet_append_rows(self.sheet, comments)
        # total review count (zh-locale text: "... 个总评分,... 带评论"), 10 reviews per page
        total_comment = int(soup.select('[data-hook="cr-filter-info-review-rating-count"]')[0].text.strip().split('总评分,')[-1].replace('带评论', '').replace(',', '').strip())
        print('total_comment', total_comment)
        total_page = math.ceil(total_comment / 10)
        executor = ThreadPoolExecutor(max_workers=3)
        for page in range(3, total_page + 1):
            executor.submit(self.treat_comment_page, id, page).add_done_callback(self.parse_exception)
        executor.shutdown(wait=True)

    def detail(self, ue_sid, id, referer):
        """Scrape a product detail page: title, price, main image and its first-page reviews."""
        headers = self.headers.copy()
        headers.update({'referer': referer})
        url = f'https://www.amazon.com/dp/{id}/{ue_sid}?psc=1'
        print(url)
        # response = self.request_retry(url, headers)
        response = self.rq.requests_get(url, headers=headers)
        print(response.status_code, response.text[:1000])
        soup = BeautifulSoup(response.text, 'lxml')
        title = soup.select('#productTitle')[0].text.strip()
        price = soup.select('.a-price span')[0].text.strip()
        img_url = soup.select('.imgTagWrapper img')[0].attrs['src']
        img_url = f'=IMAGE("{img_url}")'  # render the image directly in the Google Sheet cell
        category = ''
        comments_tag = soup.select('#cm-cr-dp-review-list .a-section.review.aok-relative')
        sku_data = [title, price, img_url, category, url]
        gsheet_append_row(self.sheet, sku_data)
        comments = []
        print(title, price, img_url, category)
        for comment_tag in comments_tag:
            comment_time = comment_tag.select('.a-row .review-date')[0].text.strip()[3:].strip()  # .split(' ')[0]
            comment_style = comment_tag.select('[data-hook="format-strip-linkless"]')[0].text.strip()
            comment_content = comment_tag.select('.a-expander-content span')[0].text.strip()
            comment_star = comment_tag.select('.a-icon-alt')[0].text.strip().split(',')[0][:-2].strip()
            print(comment_time, comment_star, comment_style, comment_content)
            comment_data = ['', '', '', '', '', comment_time, comment_star, comment_style, comment_content]
            comments.append(comment_data)
        gsheet_append_rows(self.sheet, comments)
        self.more_comment(id)

    def treat_one_page(self, url, page):
        """Scrape one page of the best-seller listing and follow each product."""
        url = url.split('=')[0] + '=' + str(page)
        response = self.request_retry(url)
        ue_sid_pattern = re.compile("ue_sid = '(.*?)'")
        id_pattern = re.compile('"id":"(.*?)",', re.S)
        ue_sid = re.search(ue_sid_pattern, response.text).group(1)
        ids = re.findall(id_pattern, response.text)
        print(ue_sid, ids)
        for idx, id in enumerate(ids):
            logger.info(f'{idx}/{len(ids)} {id}')
            self.detail(ue_sid, id, url)
            break  # only the first product for now; remove to crawl the whole page

    def main(self):
        urls = ['https://www.amazon.com/-/zh/Best-Sellers-/zgbs/fashion/1040660/?pg=1']  # best-seller women's clothing category URL
        for url in urls:
            # first listing page only; widen the range to crawl more pages
            for page in range(1, 2):
                logger.info(f'url: {url}, page:{page}')
                self.treat_one_page(url, page)


if __name__ == '__main__':
    a = Amazon()
    a.main()
```

After scraping, I push the data into a Google Sheet (gsheet), using multithreading and proxies. Questions are welcome~
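The helpers imported from utils.common and utils.request (open_gsheet, gsheet_append_row / gsheet_append_rows, sleep2, and the proxy-aware Request class) aren't shown in the post. Below is a minimal sketch of what they might look like, assuming gspread with a service-account credentials file and a simple rotating proxy pool; the file name and proxy addresses are placeholders, not the original implementation:

```python
import random
import time

import gspread   # assumed: Google Sheets access via a service account
import requests


def sleep2(seconds):
    """Assumed helper: plain sleep with a little jitter."""
    time.sleep(seconds + random.random())


def open_gsheet(spreadsheet_name, worksheet_name):
    """Assumed helper: open a worksheet with gspread service-account auth."""
    gc = gspread.service_account(filename='service_account.json')  # hypothetical credentials file
    return gc.open(spreadsheet_name).worksheet(worksheet_name)


def gsheet_append_row(sheet, row):
    # USER_ENTERED lets formulas like =IMAGE(...) render in the cell
    sheet.append_row(row, value_input_option='USER_ENTERED')


def gsheet_append_rows(sheet, rows):
    if rows:
        sheet.append_rows(rows, value_input_option='USER_ENTERED')


class Request:
    """Assumed helper: requests.get through a rotating proxy pool."""

    def __init__(self, proxies=None, timeout=20):
        # hypothetical proxy list; replace with your own pool
        self.proxies = proxies or ['http://127.0.0.1:7890']
        self.timeout = timeout

    def requests_get(self, url, headers=None):
        proxy = random.choice(self.proxies)
        return requests.get(
            url,
            headers=headers,
            proxies={'http': proxy, 'https': proxy},
            timeout=self.timeout,
        )
```

With value_input_option='USER_ENTERED', the =IMAGE(...) formula written for the main image is parsed by Google Sheets and shows up as an actual thumbnail rather than as text.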

[ATTENTION] For technical exchange only; do not use for any other purpose.
