import requests
import os
import json
from bs4 import BeautifulSoup
from urllib.parse import quote

# ==========================================================
#                   Configurable settings
# ==========================================================
# 1. Keyword to search for
SEARCH_KEYWORD = "薄荷"

# 2. Number of images to download
DOWNLOAD_LIMIT = 50

# 3. Root directory for saved images (the script automatically creates a
#    subfolder named after the keyword)
OUTPUT_DIR = "bing_images"


# ==========================================================


class BingImageScraper:
    """Scrape Bing image search results for one keyword and save the images.

    Result pages are requested from ``base_url``. Each ``<a class="iusc">``
    element on a page carries an ``m`` attribute containing JSON metadata,
    and the ``murl`` field of that JSON is the full-size image URL. Images
    are written to ``<output_dir>/<keyword>/`` until ``limit`` files have
    been saved or no further results are available.
    """

    # Number of bytes requested per chunk while streaming an image to disk.
    _CHUNK_SIZE = 1024

    def __init__(self, keyword, limit, output_dir):
        """Store the search settings and make sure the target folder exists.

        Args:
            keyword: Search term; also used as the name of the subfolder.
            limit: Number of images to save before stopping.
            output_dir: Root directory under which the keyword subfolder
                is created.
        """
        self.base_url = "https://cn.bing.com/images/search"
        self.keyword = keyword
        self.limit = limit
        self.output_dir = os.path.join(output_dir, keyword)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.downloaded_count = 0

        # Make sure the output directory exists. ``exist_ok`` keeps this safe
        # if the folder appears between the check and the creation.
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir, exist_ok=True)
            print(f"创建文件夹: {self.output_dir}")

    def fetch_images(self):
        """Walk the result pages and download images until ``limit`` is met.

        The loop ends when enough images were saved, when a search page
        request fails, when a page contains no image entries, or when a page
        contains no image URL that has not been seen before.
        """
        print(f"开始爬取 '{self.keyword}' 的图片...")

        # The paging offset is tracked separately from the success counter:
        # deriving ``first`` from ``downloaded_count`` would re-request results
        # that were already processed whenever a download fails, and would
        # loop forever on a page whose downloads all fail.
        offset = 1
        seen_urls = set()

        while self.downloaded_count < self.limit:
            # ``first`` is the paging parameter of the search request.
            params = {
                'q': self.keyword,
                'form': 'HDRSC2',
                'first': offset
            }

            try:
                response = requests.get(self.base_url, params=params, headers=self.headers, timeout=10)
                response.raise_for_status()  # raise on HTTP error status
            except requests.exceptions.RequestException as e:
                print(f"请求搜索页面失败: {e}")
                break

            # Image metadata lives in the "m" attribute of <a class="iusc">.
            soup = BeautifulSoup(response.text, 'lxml')
            image_elements = soup.find_all('a', class_='iusc')

            if not image_elements:
                print("没有找到更多图片，或者页面结构已改变。")
                break

            # Move past everything this page listed, whether or not the
            # downloads below succeed.
            offset += len(image_elements)

            # Keep only URLs not handled yet so overlapping pages never
            # produce duplicate files.
            fresh_urls = []
            for element in image_elements:
                image_url = self._extract_image_url(element.get('m'))
                if image_url and image_url not in seen_urls:
                    seen_urls.add(image_url)
                    fresh_urls.append(image_url)

            if not fresh_urls:
                # Nothing new on this page: stop instead of paging endlessly.
                print("没有找到更多图片，或者页面结构已改变。")
                break

            for image_url in fresh_urls:
                if self.downloaded_count >= self.limit:
                    break
                self.download_image(image_url)

        print(f"\n任务完成！总共下载了 {self.downloaded_count} 张图片。")

    def download_image(self, url):
        """Download one image into the output folder (best effort).

        ``downloaded_count`` is incremented only after the file has been
        written completely. Failures are reported on stdout and never
        propagate, so one bad URL does not abort the whole run.

        Args:
            url: Address of the image to download.
        """
        try:
            index = self.downloaded_count + 1
            print(f"[{index}/{self.limit}] 正在下载: {url}")

            # The context manager closes the streamed response (and releases
            # its connection) on success and on failure alike.
            with requests.get(url, headers=self.headers, timeout=15, stream=True) as img_response:
                img_response.raise_for_status()
                filepath = os.path.join(self.output_dir, self._build_filename(url, index))
                self._write_stream(img_response, filepath)

            self.downloaded_count += 1

        except requests.exceptions.RequestException as e:
            print(f"下载失败: {url}, 错误: {e}")
        except Exception as e:
            # Deliberate catch-all: a single broken image must not stop the run.
            print(f"保存文件时发生未知错误: {e}")

    def _write_stream(self, response, filepath):
        """Stream ``response`` to ``filepath``, removing the file on failure."""
        opened = False
        try:
            with open(filepath, 'wb') as f:
                opened = True
                for chunk in response.iter_content(self._CHUNK_SIZE):
                    f.write(chunk)
        except BaseException:
            # Do not leave a truncated image behind (this also covers Ctrl-C).
            # Only delete a file this call actually opened.
            if opened:
                try:
                    os.remove(filepath)
                except OSError:
                    # Cleanup is best effort; the original error is re-raised.
                    pass
            raise

    @staticmethod
    def _extract_image_url(m_attr):
        """Return the ``murl`` value from an ``m`` attribute, or ``None``.

        ``None`` is returned when the attribute is missing, is not valid
        JSON, is not a JSON object, or has no non-empty string ``murl``.
        """
        if not m_attr:
            return None
        try:
            m_data = json.loads(m_attr)
        except (TypeError, ValueError):  # JSONDecodeError is a ValueError
            return None
        if not isinstance(m_data, dict):
            return None
        image_url = m_data.get('murl')
        if isinstance(image_url, str) and image_url:
            return image_url
        return None

    @staticmethod
    def _build_filename(url, index):
        """Build a filesystem-safe file name for ``url``.

        The name is ``<index>_<last path segment>``. ``.jpg`` is appended
        when the segment contains no dot, characters other than
        alphanumerics and ``._-`` are dropped, and only the last 50
        characters are kept so the extension survives truncation.
        """
        # Drop fragment and query first so a "/" inside them cannot be
        # mistaken for the path separator.
        path = url.split('#')[0].split('?')[0]
        filename = path.split('/')[-1]
        if '.' not in filename:
            filename += '.jpg'
        cleaned = ''.join(c for c in filename if c.isalnum() or c in '._-')
        return f"{index}_{cleaned[-50:]}"


if __name__ == "__main__":
    # Build a scraper from the module-level settings, then run it.
    scraper = BingImageScraper(SEARCH_KEYWORD, DOWNLOAD_LIMIT, OUTPUT_DIR)
    scraper.fetch_images()