批量下载静止卫星数据（GridSat-B1）的 Python 代码

最新推荐文章于 2025-11-24 15:28:47 发布
原创最新推荐文章于 2025-11-24 15:28:47 发布 · 408 阅读
7 ·
CC 4.0 BY-SA版权
文章标签：
#python #数据库 #开发语言
数据处理专栏收录该内容
15 篇文章
订阅专栏
下载静止卫星水汽通道和红外波段数据的代码

1.代码

# -*- coding: utf-8 -*-
"""
Created on Fri Sep 12 15:29:53 2025

@author: heklong
"""
import os
import requests
import concurrent.futures
from typing import Dict, List

# Root URL of the GridSat-B1 archive on ncei.noaa.gov. Download URLs are
# built as "<BASE_URL>/<year>/<file name>".
BASE_URL = "https://www.ncei.noaa.gov/data/geostationary-ir-channel-brightness-temperature-gridsat-b1/access"

def load_missing_from_txt(txt_path: str) -> Dict[int, List[str]]:
    """
    Read a missing-file listing and group the downloadable file names by year.

    Each useful line looks like ``GRIDSAT-B1.2018.03.23.06_irwvp`` or
    ``GRIDSAT-B1.2018.03.23.06_irwin_cdr``. The trailing variable tag is
    replaced with ``.v02r01.nc`` so the entry becomes
    ``GRIDSAT-B1.2018.03.23.06.v02r01.nc``. Lines that carry no known tag
    (for example names that already end in ``.v02r01.nc``) are kept as-is.

    Blank lines, lines starting with ``#`` and lines whose second
    dot-separated field is not an integer year are skipped. A file name that
    appears more than once after conversion is listed only once, so the same
    file is never queued for download twice.

    Args:
        txt_path: Path of the UTF-8 text file, one entry per line.

    Returns:
        Mapping of year to the list of file names for that year, in the
        order they first appear in the listing.
    """
    # Variable tags that may follow the timestamp in the listing.
    variable_tags = ("_irwin_cdr", "_irwvp")
    archive_suffix = ".v02r01.nc"

    year_map: Dict[int, List[str]] = {}
    seen = set()
    with open(txt_path, "r", encoding="utf-8") as f:
        for line in f:
            name = line.strip()
            if not name or name.startswith("#"):
                continue
            # The year is the second dot-separated field of the name.
            try:
                year = int(name.split(".")[1])
            except (IndexError, ValueError):
                continue
            # Swap the variable tag for the archive file suffix.
            fixed_name = name
            for tag in variable_tags:
                fixed_name = fixed_name.replace(tag, archive_suffix)
            # Two tags for the same timestamp map to the same file.
            if fixed_name in seen:
                continue
            seen.add(fixed_name)
            year_map.setdefault(year, []).append(fixed_name)
    return year_map


def ensure_year_dir(out_root: str, year: int) -> str:
    """Create the ``<out_root>/<year>`` folder if it is missing and return its path."""
    year_folder = str(year)
    target = os.path.join(out_root, year_folder)
    # exist_ok=True makes repeated calls for the same year harmless.
    os.makedirs(target, exist_ok=True)
    return target


def download_one(year: int, fname: str, out_root: str, timeout=30, chunk=1024*1024):
    """
    Download one missing file into ``<out_root>/<year>/<fname>``.

    The file is fetched from ``<BASE_URL>/<year>/<fname>``. Nothing is
    downloaded when the target file already exists. The response body is
    streamed into ``<fname>.part`` and renamed to the final name only after
    the whole body has been written, so an interrupted transfer never leaves
    a truncated file that a later run would skip as already present.

    Every outcome is reported with ``print``; the function returns ``None``
    and does not raise for HTTP or network failures.

    Args:
        year: Year folder on the server and under ``out_root``.
        fname: File name on the server, also used as the local file name.
        out_root: Local root directory; the year subfolder is created on demand.
        timeout: Timeout in seconds passed to ``requests.get``.
        chunk: Chunk size in bytes used when streaming the body to disk.
    """
    save_dir = ensure_year_dir(out_root, year)
    url = f"{BASE_URL}/{year}/{fname}"
    out_path = os.path.join(save_dir, fname)
    # Temporary name used while the transfer is in progress.
    part_path = out_path + ".part"

    if os.path.exists(out_path):
        print(f"[SKIP] 已存在: {out_path}")
        return

    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            if r.status_code != 200:
                print(f"[FAIL] {fname} HTTP {r.status_code}: {url}")
                return
            # An HTML body means the server answered with a page, not a data file.
            ctype = r.headers.get("Content-Type", "")
            if "text/html" in ctype.lower():
                print(f"[FAIL] {fname} 不是数据文件: {url}")
                return

            with open(part_path, "wb") as f:
                for chunk_bytes in r.iter_content(chunk_size=chunk):
                    if chunk_bytes:
                        f.write(chunk_bytes)
        # Publish the file under its final name only once it is complete.
        os.replace(part_path, out_path)
        print(f"[OK]  {fname}")
    except Exception as e:
        # Best-effort worker: report the failure and let the batch continue.
        print(f"[ERR] {fname}: {e}")
        # Remove the incomplete temporary file, if one was created.
        try:
            os.remove(part_path)
        except OSError:
            pass


def download_missing_list(out_root: str, year_to_files: Dict[int, List[str]], max_workers=6):
    """
    Download every listed file concurrently with a thread pool.

    One ``download_one`` task is submitted per file name. ``download_one``
    prints its own progress messages; any exception that still escapes a task
    is printed here as an ``[ERR]`` line instead of being silently discarded
    with the future.

    Args:
        out_root: Local root directory handed to ``download_one``.
        year_to_files: Mapping of year to the file names to fetch for that year.
        max_workers: Number of worker threads in the pool.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
        # Remember which file each future belongs to so failures can be named.
        pending = {
            ex.submit(download_one, year, fname, out_root): fname
            for year, files in year_to_files.items()
            for fname in files
        }
        for fut in concurrent.futures.as_completed(pending):
            exc = fut.exception()
            if exc is not None:
                print(f"[ERR] {pending[fut]}: {exc}")


if __name__ == "__main__":
    # Root folder for downloads; files are stored in a per-year subfolder below it.
    output_path = r"H:\Gridsat\2017"
    # Text file listing the missing files, one entry per line.
    txt_missing = r"H:\Gridsat\2017\missing_2017.txt"

    # Parse the listing, then fetch everything it names with two worker threads.
    missing_files = load_missing_from_txt(txt_missing)
    download_missing_list(output_path, missing_files, max_workers=2)

# import os
# import requests
# from bs4 import BeautifulSoup
# import concurrent.futures
# from typing import Dict, List

# BASE_URL = "https://www.ncei.noaa.gov/data/geostationary-ir-channel-brightness-temperature-gridsat-b1/access"

# # ============ 方式 A：在代码中直接给出缺失清单 ============
# # 每个年份对应一个文件名列表（文件名需与站点一致）



# MISSING: Dict[int, List[str]] = {
#     2018: [
#         # 例子：把你的缺失文件名填到这里（含 .nc 后缀）
#         "GRIDSAT-B1.2018.01.21.15.v02r01.nc",
#         # "GRIDSAT-B1.2018.01.01.03.v02r01.nc",
#     ],
#     2019: [
#         "GRIDSAT-B1.2019.01.01.00.v02r01.nc",
#     ]
# }

# # ============ 方式 B：从 txt 清单读取（可选） ============
# # txt 每行一个文件名（例如：GRIDSAT-B1.2018.01.01.00.v02r01.nc）
# # 如果你想用 txt，把上面的 MISSING 留空或删掉对应年份，然后启用下面的函数
# def load_missing_from_txt(txt_path: str) -> Dict[int, List[str]]:
#     year_map: Dict[int, List[str]] = {}
#     with open(txt_path, "r", encoding="utf-8") as f:
#         for line in f:
#             name = line.strip()
#             if not name or name.startswith("#"):
#                 continue
#             # 尝试从文件名中提取年份（GRIDSAT-B1.YYYY.MM.DD.HH.v02r01.nc）
#             try:
#                 year = int(name.split(".")[1])
#             except Exception:
#                 continue
#             year_map.setdefault(year, []).append(name)
#     return year_map


# def ensure_year_dir(out_root: str, year: int) -> str:
#     save_dir = os.path.join(out_root, str(year))
#     os.makedirs(save_dir, exist_ok=True)
#     return save_dir


# def download_one(year: int, fname: str, out_root: str, timeout=30, chunk=1024*1024):
#     """下载单个缺失文件（存在则跳过）"""
#     save_dir = ensure_year_dir(out_root, year)
#     url = f"{BASE_URL}/{year}/{fname}"
#     out_path = os.path.join(save_dir, fname)

#     if os.path.exists(out_path):
#         print(f"[SKIP] 已存在: {out_path}")
#         return

#     try:
#         with requests.get(url, stream=True, timeout=timeout) as r:
#             if r.status_code != 200:
#                 print(f"[FAIL] {fname} HTTP {r.status_code}: {url}")
#                 return
#             # 可选：简单检查一下目录页是否返回了 HTML（不是二进制文件）
#             ctype = r.headers.get("Content-Type", "")
#             if "text/html" in ctype.lower():
#                 print(f"[FAIL] {fname} 看起来不是数据文件（Content-Type={ctype}）: {url}")
#                 return

#             with open(out_path, "wb") as f:
#                 for chunk_bytes in r.iter_content(chunk_size=chunk):
#                     if chunk_bytes:
#                         f.write(chunk_bytes)
#         print(f"[OK]  {fname}")
#     except Exception as e:
#         print(f"[ERR] {fname}: {e}")


# def download_missing_list(out_root: str, year_to_files: Dict[int, List[str]], max_workers=4):
#     """并发下载缺失清单"""
#     tasks = []
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
#         for year, files in year_to_files.items():
#             for fname in files:
#                 tasks.append(ex.submit(download_one, year, fname, out_root))
#         for fut in concurrent.futures.as_completed(tasks):
#             # 已在 download_one 里处理打印，这里不需要额外处理
#             pass


# if __name__ == "__main__":
#     # 输出目录
#     output_path = r"H:\Gridsat"

#     # —— 如果你要从 txt 读取，取消下一行注释并指定清单路径，然后把 MISSING 按需合并/替换 ——
#     # txt_missing = r"H:\Gridsat\missing_irwin_cdr_2018_2019.txt"
#     # missing_from_txt = load_missing_from_txt(txt_missing)
#     # MISSING = missing_from_txt  # 或者把两者合并：MISSING.update(missing_from_txt)

#     # 开始下载缺失
#     download_missing_list(output_path, MISSING, max_workers=4)