如何用爬虫抓取电商平台数据2016年爬取热销top女装类目数据(ATTENTION-)

目标
获取某大型国外电商网站数据如何用爬虫抓取电商平台数据，链接
本次目标是爬取热销top女装类目数据如何用爬虫抓取电商平台数据，如下：
具体字段包括sku详情页标题、价格、主图等：
以及所有评论（评价时间、评价星级、购买产品属性明细-颜色&尺寸、评价文字内容）：
具体代码如下：
# -*- coding: utf-8 -*-import mathimport reimport timeimport requestsfrom loguru import loggerfrom bs4 import BeautifulSoupfrom concurrent.futures import ThreadPoolExecutorfrom utils.request import Requestfrom utils.common import sleep2, open_gsheet, gsheet_append_rows, gsheet_append_rowclass Amazon:    def __init__(self):        self.headers = {            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',            'accept-encoding': 'gzip, deflate, br',            'accept-language': 'zh-CN,zh;q=0.9',            'cookie': 'aws-target-data=%7B%22support%22%3A%221%22%7D; aws-target-visitor-id=1658219333848-161299.32_0; regStatus=pre-register; AMCV_7742037254C95E840A4C98A6%40AdobeOrg=1585540135%7CMCIDTS%7C19193%7CMCMID%7C83739249770090151470991008238048993563%7CMCAAMLH-1658824133%7C3%7CMCAAMB-1658824133%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1658226534s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.4.0; session-id=139-5736486-7904207; i18n-prefs=USD; lc-main=zh_CN; ubid-main=131-1149456-9803809; sp-cdn="L5Z9:HK"; session-id-time=2082787201l; session-token="dW/9hnHEq1nmBVnrkKUH15nDtZkBICOXbe1BE8OEWhllybEQ+H31J6NXaSiBTzzNmOv60c8EBGA4GPla0ng0Q3RzWm6akOwxNOWeN0iFCMxh/uJ3qua0+x+5FFoIZch7r4f864Kvpaa1oOIqmGvimXVeedFsUAUTqCltiV9F3G6DGCHFRPWCYCTlxmU91tqg4NDWtLK5Z+osF5rGxyj5Oi/wzGa+1vaFx/ES6bUFZyQ="; csm-hit=tb:s-6PGAB78TGWTJMYP4B7AG|1668503449994&t:1668503450108&adb:adblk_no',            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'        }        self.sheet = open_gsheet('US COMMENTS', 'amazon')        self.rq = Request()    def request_retry(self, url):        i = 0        while i < 10:            try:                # r = requests.get(url, headers=headers, timeout=20)                r = self.rq.requests_get(url, headers=self.headers)                if "Sorry, we just need to make sure you're not a robot" in r.text:                    print("Sorry, we just need to make sure you're not a robot")
                    i += 1
                    sleep2(10)
                    print(f'try  {i}  times')
                    continue
                if r.status_code in [404]:
                    i += 1
                    sleep2(10)
                    print(f'try  {i}  times, 404')
                    continue
                break
            except Exception as e:  # 网站原因
                i += 1
                sleep2(10)
                print(f'try {i} times, {e.args}')
        else:
            raise
        return r
    def treat_comment_page(self, id, page):
        logger.info(f'page: {page}')
        url = f'https://www.amazon.com/product-reviews/{id}/ref=cm_cr_getr_d_paging_btm_next_{page}?ie=UTF8&reviewerType=all_reviews&pageNumber={page}'
        r = self.request_retry(url)
        # r = self.rq.requests_get(url, headers=self.headers)
        # print(r.status_code, r.text[:1000])
        soup = BeautifulSoup(r.text, 'lxml')
        comments_tag = soup.select('.a-section.review')
        comments = []
        for comment_tag in comments_tag:
            comment_time = comment_tag.select('.a-row .review-date')[0].text.strip()[3:].strip()  # .split(' ')[0]
            try:
                comment_style = comment_tag.select('.a-size-mini.a-link-normal.a-color-secondary')[0].text.strip()
            except:
                comment_style = ''
            comment_content = comment_tag.select('[data-hook="review-body"]')[0].text.replace('n', '').strip()
            comment_star = comment_tag.select('[data-hook="review-star-rating"] .a-icon-alt')[0].text.strip().split('，')[0][:-2].strip()
            print(comment_time, comment_star, comment_style, comment_content)
            comment_data = ['', '', '', '', '', comment_time, comment_star, comment_style, comment_content]
            comments.append(comment_data)
        gsheet_append_rows(self.sheet, comments)
    def parse_exception(self, obj):
        res = obj.result()
        if res:
            print(res)

    def more_comment(self, id):
        # 先从第二页获取评论总数
        logger.info('page: 2')
        url = f'https://www.amazon.com/product-reviews/{id}/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2'
        r = self.request_retry(url)
        soup = BeautifulSoup(r.text, 'lxml')
        comments_tag = soup.select('.a-section.review')
        comments = []
        for comment_tag in comments_tag:
            comment_time = comment_tag.select('.a-row .review-date')[0].text.strip()[3:].strip()  # .split(' ')[0]
            try:
                comment_style = comment_tag.select('.a-size-mini.a-link-normal.a-color-secondary')[0].text.strip()
            except:
                comment_style = ''
            comment_content = comment_tag.select('[data-hook="review-body"]')[0].text.replace('n', '').strip()
            comment_star = comment_tag.select('[data-hook="review-star-rating"] .a-icon-alt')[0].text.strip().split('，')[0][:-2].strip()
            print(comment_time, comment_star, comment_style, comment_content)
            comment_data = ['', '', '', '', '', comment_time, comment_star, comment_style, comment_content]
            comments.append(comment_data)
        gsheet_append_rows(self.sheet, comments)
        total_comment = int(soup.select('[data-hook="cr-filter-info-review-rating-count"]')[0].text.strip().split('总评分,')[-1].replace('带评论', '').replace(',', '').strip())
        print('total_comment', total_comment)
        total_page = math.ceil(total_comment / 10)
        exec = ThreadPoolExecutor(max_workers=3)
        for page in range(3, total_page + 1):
            exec.submit(self.treat_comment_page, id, page).add_done_callback(self.parse_exception)
        exec.shutdown(wait=True)
    def detail(self, ue_sid, id, referer):
        headers = self.headers.copy()
        headers.update({'referer': referer})
        url = f'https://www.amazon.com/dp/{id}/{ue_sid}?psc=1'
        print(url)
        # response = self.request_retry(url, headers)
        response = self.rq.requests_get(url, headers=headers)
        print(response.status_code, response.text[:1000])
        soup = BeautifulSoup(response.text, 'lxml')
        title = soup.select('#productTitle')[0].text.strip()
        price = soup.select('.a-price span')[0].text.strip()
        img_url = soup.select('.imgTagWrapper img')[0].attrs['src']
        img_url = f'=IMAGE("{img_url}")'
        category = ''
        comments_tag = soup.select('#cm-cr-dp-review-list .a-section.review.aok-relative')

        sku_data = [title, price, img_url, category, url]
        gsheet_append_row(self.sheet, sku_data)
        comments = []
        print(title, price, img_url, category)
        for comment_tag in comments_tag:
            comment_time = comment_tag.select('.a-row .review-date')[0].text.strip()[3:].strip()   # .split(' ')[0]
            comment_style = comment_tag.select('[data-hook="format-strip-linkless"]')[0].text.strip()
            comment_content = comment_tag.select('.a-expander-content span')[0].text.strip()
            comment_star = comment_tag.select('.a-icon-alt')[0].text.strip().split('，')[0][:-2].strip()
            print(comment_time, comment_star, comment_style, comment_content)
            comment_data = ['', '', '', '', '', comment_time, comment_star, comment_style, comment_content]
            comments.append(comment_data)
        gsheet_append_rows(self.sheet, comments)
        self.more_comment(id)
    def treat_one_page(self, url, page):
        url = url.split('=')[0] + '=' + str(page)
        response = self.request_retry(url)
        ue_sid_pattern = re.compile("ue_sid = '(.*?)'")
        id_pattern = re.compile('"id":"(.*?)",', re.S)
        ue_sid = re.search(ue_sid_pattern, response.text).group(1)
        ids = re.findall(id_pattern, response.text)
        print(ue_sid, ids)
        for idx, id in enumerate(ids):
            logger.info(f'{idx}/{len(ids)} {id}')
            self.detail(ue_sid, id, url)
            break
    def main(self):
        urls = ['https://www.amazon.com/-/zh/Best-Sellers-/zgbs/fashion/1040660/?pg=1']   # top女装目录链接
        for url in urls:
            # 取前2页
            for page in range(1, 2):
                logger.info(f'url: {url}, page:{page}')
                self.treat_one_page(url, page)
if __name__ == '__main__':
        a = Amazon()
        a.main()