# 06_TMDB高分榜单.py

import csv
import os
from urllib.parse import urljoin

import requests
from lxml import html


"""
这个部分要结合html网页结构复习
核心就是自己借助浏览器寻找准确html路径xpath
并且随着时间推移网站可能会有变化 所以需要不断更新程序
"""
# Path of the CSV file the scraped rows are written to (a relative path, so it
# is resolved against the directory the script is run from).
MOVIE_LIST_FILE = "csv_data/movie_list.csv"
# Site root, used as the base for the per-movie links found on the listing page.
URL = "https://www.themoviedb.org/"
# Listing page to scrape. Requesting it directly only returns the first page
# of results; further entries require clicking "Load More" on the page.
TARGET_URL = "https://www.themoviedb.org/movie/top-rated"


# Save the collected movie information as a CSV file
def save_movie(movie_info):
    """Write the collected movie rows to MOVIE_LIST_FILE as CSV.

    The file is overwritten on every call and starts with a header row.

    Args:
        movie_info: Iterable of dicts, one per movie, whose keys are the
            column names listed in ``fieldnames`` below.
    """
    fieldnames = [
        "名称", "年份", "上映时间", "类型", "时长", "评分",
        "语言", "作者", "导演", "简介", "标语",
    ]
    # Create the output directory if needed: without this, a missing
    # "csv_data/" folder makes open() raise FileNotFoundError and the rows
    # passed in are lost.
    output_dir = os.path.dirname(MOVIE_LIST_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with open(MOVIE_LIST_FILE, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(movie_info)


# Fetch the detail information of a single movie
def get_movie_info(movie_url):
    """Download one movie detail page and extract its fields into a dict.

    Every field is located with an absolute XPath copied from the browser's
    developer tools (F12), so the paths are tied to the page layout and have
    to be updated whenever the site changes.

    Args:
        movie_url: URL of the movie's detail page.

    Returns:
        A dict mapping the CSV column names ("名称", "年份", ...) to the
        whitespace-stripped text found on the page. A field whose XPath
        matches nothing is stored as an empty string.
    """
    # Announce the request before issuing it, so the log shows which page is
    # being waited on while the call blocks.
    print(f"发送请求给电影{movie_url}")
    movie_html = requests.get(movie_url, timeout=100).text
    movie_doc = html.fromstring(movie_html)

    # (column name, XPath, join_all)
    # join_all=False -> keep only the first match.
    # join_all=True  -> keep every match, comma-separated (genres, writers,
    #                   directors can have several entries).
    field_specs = (
        # Title
        ("名称", "//*[@id='original_header']/div[2]/section/div[1]/h2/a/text()", False),
        # Year
        ("年份", "//*[@id='original_header']/div[2]/section/div[1]/h2/span/text()", False),
        # Release date
        ("上映时间", "//*[@id='original_header']/div[2]/section/div[1]/div/span[2]/text()", False),
        # Genres
        ("类型", "//*[@id='original_header']/div[2]/section/div[1]/div/span[3]/a/text()", True),
        # Runtime
        ("时长", "//*[@id='original_header']/div[2]/section/div[1]/div/span[4]/text()", False),
        # Score: read from the element's data-percent attribute, not its text
        ("评分", "//*[@id='consensus_pill']/div/div[1]/div/div/@data-percent", False),
        # Language
        ("语言", "//*[@id='media_v4']/div/div/div[2]/div/section/div[1]/div/section[1]/p[3]/text()", False),
        # Writer(s)
        ("作者", "//*[@id='original_header']/div[2]/section/div[3]/ol/li[2]/p[1]/a/text()", True),
        # Director(s)
        ("导演", "//*[@id='original_header']/div[2]/section/div[3]/ol/li[1]/p[1]/a/text()", True),
        # Overview
        ("简介", "//*[@id='original_header']/div[2]/section/div[3]/div/p/text()", False),
        # Tagline
        ("标语", "//*[@id='original_header']/div[2]/section/div[3]/h3[1]/text()", False),
    )

    movie_info = {}
    for column, xpath, join_all in field_specs:
        matches = movie_doc.xpath(xpath)
        if not matches:
            # Missing element on this page: keep the column, leave it blank.
            value = ''
        elif join_all:
            # Strip each item so stray whitespace cannot end up inside the
            # joined value, not only at its ends.
            value = ",".join(text.strip() for text in matches)
        else:
            value = matches[0].strip()
        movie_info[column] = value
    return movie_info


# Main function: core scraping logic
def main():
    """Scrape the top-rated listing page and save every movie's details.

    Downloads TARGET_URL, selects the movie cards under the element with
    id ``page_1``, follows each card's detail link via get_movie_info(),
    and finally passes all collected rows to save_movie().
    """
    # Announce the request before issuing it so progress is visible while
    # the call blocks.
    print("正在发送请求...获取TMDB高分榜单数据")
    # timeout is the limit (in seconds) requests waits on the server before
    # raising instead of hanging indefinitely.
    html_text = requests.get(TARGET_URL, timeout=60).text
    doc = html.fromstring(html_text)
    # XPath of the movie cards, copied from the page structure in the browser.
    movie_list = doc.xpath("//*[@id='page_1']/div[@class='card style_1']")
    all_movies = []  # one info dict per movie
    for movie in movie_list:
        # Relative lookup inside the current card for the detail-page link.
        movie_url = movie.xpath("./div/div/a/@href")
        if not movie_url:
            continue  # card without a link: nothing to fetch
        # urljoin resolves the href against the site root; plain string
        # concatenation produced "…org//movie/…" because URL already ends
        # with "/" (assuming root-relative hrefs), and would mangle an
        # absolute href.
        movie_info_url = urljoin(URL, movie_url[0])
        all_movies.append(get_movie_info(movie_info_url))
    print("已经获取所有电影详情 准备保存到csv文件")
    save_movie(all_movies)  # write everything to the CSV file


if __name__ == "__main__":  # run the scraper only when executed as a script, not on import
    main()