program da_system
use mpi
use iso_c_binding
use module_grid
use module_grid_interp
use module_io
use module_data
use module_numerical
use module_svd
use module_process
implicit none
! Observation arrays (node-level shared memory)
integer, parameter :: nobs_actual = 883*555 ! actual number of observations (compile-time constant)
real, parameter :: fillvalue = 999999.0
real, pointer :: obs_wind_ptr(:) => null() ! shared pointer to observation values
real, pointer :: obs_locations_ptr(:,:) => null() ! shared pointer to observation locations
real, pointer :: obs_errors_ptr(:) => null() ! shared pointer to observation errors
real, pointer :: obs_weight_ptr(:) => null() ! shared pointer to observation weights
! Shared-memory pointers for the large arrays - split into 5 variables
real, pointer :: ensemble_pi_ptr(:,:,:,:) => null() ! shared pointer to pi ensemble data
real, pointer :: ensemble_u_ptr(:,:,:,:) => null() ! shared pointer to u ensemble data
real, pointer :: ensemble_v_ptr(:,:,:,:) => null() ! shared pointer to v ensemble data
real, pointer :: ensemble_th_ptr(:,:,:,:) => null() ! shared pointer to th ensemble data
real, pointer :: ensemble_q_ptr(:,:,:,:) => null() ! shared pointer to q ensemble data
type(model_data), pointer :: back_data_ptr => null() ! shared pointer to the background field
type(model_data), pointer :: true_data_ptr => null() ! shared pointer to the truth field
! Local data
type(model_data) :: analysis_increment
type(model_data_point) :: analysis_increment_point
real, dimension(ids:ide, kms:kme, jds:jde) :: height_full, height_half
! Main loop variables
integer :: i_grid, j_grid, k
integer :: member, i_obs
integer :: i_start, i_end, j_start, j_end ! localization range indices
integer :: myid0_task, myid0_task_total
! Parallelization parameters
integer :: myid, numprocs, ierr
integer :: node_comm, node_rank, node_size
integer :: inter_comm ! inter-node communicator (only node root processes participate)
integer :: win_obs, win_ensemble_pi, win_ensemble_u, win_ensemble_v, win_ensemble_th, win_ensemble_q, win_back, win_true ! shared-memory windows
integer(kind=MPI_ADDRESS_KIND) :: ssize_obs, ssize_data
! Mask-based parallelization variables
integer, allocatable :: assimilation_mask(:,:)
integer, allocatable :: my_grid_list(:,:)
integer :: n_valid_points, n_my_points, grid_idx, obs_count
integer :: psize, pstart, pend
! Grid-block decomposition variables (used for stage-1 mask generation)
integer :: px, py, px_rank, py_rank
integer :: istart_proc, iend_proc, jstart_proc, jend_proc
integer :: n_local_points
! Stage 2: dynamic scheduling (master-worker)
integer, parameter :: TAG_REQUEST = 100, TAG_ASSIGN = 101
integer :: task_id, worker_id, status(MPI_STATUS_SIZE)
integer, allocatable :: all_i_coords(:), all_j_coords(:)
integer :: total_points, idx
! Stage 3: dynamic-scheduling main loop
! ====== Dynamic-scheduling main loop (batched dispatch + non-blocking) ======
integer, parameter :: TAG_DONE = 102
integer :: tasks_per_worker ! batch size per assignment, computed at run time
integer, allocatable :: task_ids(:)
! ====== Variables for the dynamic-scheduling main loop (must be declared in the specification part) ======
integer :: next_task, n_workers, request, i, n_this_batch, req, j
logical :: flag
integer :: n_tasks_sent, n_tasks_done
! ====== Task-pool mechanism variables ======
integer, allocatable :: worker_requests(:) ! request handle per worker
integer, allocatable :: worker_send_reqs(:) ! send handle per worker
integer, allocatable :: worker_status(:,:) ! MPI status per worker
logical, allocatable :: worker_active(:) ! is the worker still active?
logical, allocatable :: worker_req_pending(:) ! is a request from the worker pending?
integer :: n_active_workers, completed_worker, send_req
integer :: task_pool_size, current_task
integer :: target_batches_per_worker ! target number of batches per worker
real :: progress_percent ! progress percentage
integer :: report_interval ! reporting interval
integer :: print_count
! ================= MPI initialization and process information =================
call MPI_INIT(ierr) ! initialize the MPI environment; all processes must call this
call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr) ! global rank; called by all processes
call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr) ! total number of processes; called by all processes
! Create the intra-node communicator
call MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, node_comm, ierr)
call MPI_Comm_rank(node_comm, node_rank, ierr) ! rank within the node
call MPI_Comm_size(node_comm, node_size, ierr) ! number of processes on the node
! Create the inter-node communicator (all processes call; non-root ranks get MPI_COMM_NULL)
if (node_rank == 0) then
call MPI_Comm_split(MPI_COMM_WORLD, 0, myid, inter_comm, ierr)
else
call MPI_Comm_split(MPI_COMM_WORLD, MPI_UNDEFINED, myid, inter_comm, ierr)
end if
! Compute the 2-D process decomposition (proportional to the x/y extent, excluding the boundary radius) - used for stage-1 mask generation
call compute_process_grid(numprocs, nx-2*local_radius, ny-2*local_radius, px, py)
px_rank = mod(myid, px) ! process index in x
py_rank = myid / px ! process index in y
! Compute the grid region owned by this process (based on the effective computation area)
call compute_local_domain(px_rank, py_rank, px, py, nx-2*local_radius, ny-2*local_radius, &
istart_proc, iend_proc, jstart_proc, jend_proc)
! Convert to actual grid coordinates: the effective computation area is [local_radius+1, nx-local_radius]
istart_proc = istart_proc + local_radius ! start: 1 -> local_radius+1
jstart_proc = jstart_proc + local_radius
iend_proc = iend_proc + local_radius ! end: nx-2*local_radius -> nx-local_radius
jend_proc = jend_proc + local_radius
if (myid == 0) then
write(*,'(A,4I6)') 'DEBUG: Final process domain for mask generation: ', &
istart_proc, iend_proc, jstart_proc, jend_proc
write(*,'(A,2I6)') 'DEBUG: Domain size: ', (iend_proc-istart_proc+1), (jend_proc-jstart_proc+1)
end if
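! Example of the index shift above: local point (1,1) of the effective sub-domain
! maps to full-grid point (local_radius+1, local_radius+1), and the last effective
! point (nx-2*local_radius, ny-2*local_radius) maps to (nx-local_radius, ny-local_radius),
! so the local_radius-wide boundary halo is excluded from mask generation.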
if (myid == 0) then
write(*,'(A)') strline
write(*,'(A)') ' NUDT Regional Data Assimilation'
write(*,'(A)') ' with Shared Memory & Two-Stage Mask-based Parallel'
write(*,'(A)') strline
write(*,'(A,I6)') 'Total processes: ', numprocs
write(*,'(A,3I6)') 'Grid dimensions: ', nx, ny, nz
write(*,'(A,I6)') 'Local radius: ', local_radius
write(*,'(A,2I6)') 'Effective computation area: ', nx-2*local_radius, ny-2*local_radius
write(*,'(A,I6)') 'Ensemble size: ', nsample
write(*,'(A,I6)') 'SVD rank truncation: ', rank_truncation
write(*,'(A,I6)') 'Min observations required: ', min_obs_required
write(*,'(A,2I6)') 'Process grid for mask generation (px,py): ', px, py
write(*,'(A)') strline
end if
! ===== Node-level shared-memory allocation =====
call setup_shared_memory_obs(node_comm, node_rank, obs_wind_ptr, &
obs_locations_ptr, obs_errors_ptr, &
obs_weight_ptr, nobs_actual, win_obs)
call setup_shared_memory_data(node_comm, node_rank, ensemble_pi_ptr, &
ensemble_u_ptr, ensemble_v_ptr, ensemble_th_ptr, ensemble_q_ptr, &
back_data_ptr, true_data_ptr, &
win_ensemble_pi, win_ensemble_u, win_ensemble_v, win_ensemble_th, win_ensemble_q, &
win_back, win_true)
! The root process of each node reads/receives the data for its node
if (node_rank == 0) then
! The global master process reads the height data
if (myid == 0) then
write(*,'(A)') 'Reading height data ...'
call read_height('zz66.dat', height_full, height_half)
write(*,'(A)') 'Height data: OK'
end if
end if
! Intra-node synchronization to make sure the shared-memory buffers are ready
call MPI_Barrier(node_comm, ierr) ! all processes must call this
! Node root processes broadcast the height data among themselves
if (node_rank == 0) then
call MPI_Bcast(height_full, size(height_full), MPI_REAL, 0, inter_comm, ierr)
call MPI_Bcast(height_half, size(height_half), MPI_REAL, 0, inter_comm, ierr)
end if
! Read and store the data shared within this node
if (node_rank == 0) then
write(*,'(A,I5,A)') 'Node root process ', myid, ' reading data ...'
call read_and_store_shared_data(ensemble_pi_ptr, ensemble_u_ptr, &
ensemble_v_ptr, ensemble_th_ptr, ensemble_q_ptr, &
back_data_ptr, true_data_ptr, obs_wind_ptr, &
obs_locations_ptr, obs_errors_ptr, &
obs_weight_ptr, nobs_actual, &
height_full, height_half, .false.)
end if
! Intra-node synchronization to ensure the data read is complete on all processes
call MPI_Barrier(node_comm, ierr) ! all processes must call this
! Important: observation locations must be globally consistent! They are generated once by rank 0 and then broadcast to all nodes
if (myid == 0) then
write(*,'(A)') 'Global master generating observation locations (ONCE for all nodes)...'
call generate_obs_locations(obs_locations_ptr, nobs_actual)
write(*,'(A)') 'Global master observation location generation completed.'
! Debug: check the generated observation locations
write(*,'(A,2F8.2)') 'Debug: First obs location: ', obs_locations_ptr(1,1), obs_locations_ptr(1,2)
write(*,'(A,2F8.2)') 'Debug: Last obs location: ', obs_locations_ptr(nobs_actual,1), obs_locations_ptr(nobs_actual,2)
write(*,'(A,F8.2)') 'Debug: Min obs i-coord: ', minval(obs_locations_ptr(:,1))
write(*,'(A,F8.2)') 'Debug: Max obs i-coord: ', maxval(obs_locations_ptr(:,1))
write(*,'(A,F8.2)') 'Debug: Min obs j-coord: ', minval(obs_locations_ptr(:,2))
write(*,'(A,F8.2)') 'Debug: Max obs j-coord: ', maxval(obs_locations_ptr(:,2))
end if
! Broadcast the observation locations to all processes (across nodes)
call MPI_Bcast(obs_locations_ptr, nobs_actual*3, MPI_REAL, 0, MPI_COMM_WORLD, ierr)
! A global synchronization is enough to ensure every node root has finished loading the data
call MPI_Barrier(MPI_COMM_WORLD, ierr) ! all processes must call this
! Initialize this process's analysis increment (model_data already has a fixed shape, no allocate needed)
analysis_increment%pi = fillvalue
analysis_increment%u = fillvalue
analysis_increment%v = fillvalue
analysis_increment%th = fillvalue
analysis_increment%q = fillvalue
! ====== Two-stage mask filtering and load-balanced distribution ======
! Stage 1: parallel mask generation based on grid-block decomposition
! 1. Create and initialize the 2-D mask array
allocate(assimilation_mask(local_radius+1:nx-local_radius, local_radius+1:ny-local_radius))
assimilation_mask = 0
if (myid == 0) then
write(*,'(A)') '====== DEBUG: Entering mask generation stage ======'
write(*,'(A,4I6)') 'Process domain: istart, iend, jstart, jend = ', &
istart_proc, iend_proc, jstart_proc, jend_proc
write(*,'(A,4I6)') 'Mask array bounds: ', local_radius+1, nx-local_radius, local_radius+1, ny-local_radius
write(*,'(A)') 'Stage 1: Generating assimilation mask using grid-block parallelization...'
write(*,'(A,I4,A,I4,A)') 'Each process handles a ', &
(iend_proc-istart_proc+1), ' x ', (jend_proc-jstart_proc+1), ' block'
! Debug: check the observation data
write(*,'(A,I8)') 'Total observations: ', nobs_actual
write(*,'(A,F8.2,F8.2)') 'First observation location: ', &
obs_locations_ptr(1,1), obs_locations_ptr(1,2)
write(*,'(A,F8.2,F8.2)') 'Last observation location: ', &
obs_locations_ptr(nobs_actual,1), obs_locations_ptr(nobs_actual,2)
write(*,'(A,I4)') 'Local radius: ', local_radius
write(*,'(A,I4)') 'Min observations required: ', min_obs_required
write(*,'(A,I4,A,I4)') 'Valid grid range: i=', local_radius+1, ' to ', nx-local_radius
write(*,'(A,I4,A,I4)') 'Valid grid range: j=', local_radius+1, ' to ', ny-local_radius
end if
! 2. Each process generates the mask for its own sub-domain in parallel
if (myid == 0) then
write(*,'(A)') '====== DEBUG: Starting mask generation loop ======'
write(*,'(A,4I6)') 'Loop bounds: jstart, jend, istart, iend = ', &
jstart_proc, jend_proc, istart_proc, iend_proc
end if
do j_grid = jstart_proc, jend_proc
do i_grid = istart_proc, iend_proc
obs_count = 0
! Count the observations within local_radius of this grid point
do k = 1, nobs_actual
! First check that the observation lies inside the valid grid range
if (obs_locations_ptr(k,1) >= local_radius+1 .and. obs_locations_ptr(k,1) <= nx-local_radius .and. &
obs_locations_ptr(k,2) >= local_radius+1 .and. obs_locations_ptr(k,2) <= ny-local_radius) then
! Then check whether it falls within local_radius of the current grid point
if (abs(nint(obs_locations_ptr(k,1)) - i_grid) <= local_radius .and. &
abs(nint(obs_locations_ptr(k,2)) - j_grid) <= local_radius) then
obs_count = obs_count + 1
end if
end if
end do
! If enough observations were found, mark the point as assimilable
if (obs_count >= min_obs_required) then
assimilation_mask(i_grid, j_grid) = 1
end if
! Debug: print observation counts for the first few grid points (rank 0 only)
if (myid == 0 .and. ((i_grid-istart_proc)*(jend_proc-jstart_proc+1) + (j_grid-jstart_proc+1)) <= 10) then
write(*,'(A,I4,A,I4,A,I4,A,I4)') 'Grid (', i_grid, ',', j_grid, ') has ', obs_count, ' observations, mask=', assimilation_mask(i_grid, j_grid)
end if
end do
end do
! 3. Globally merge the masks: MPI_MAX keeps any point flagged by its owning block (all other ranks contribute 0 there)
call MPI_Allreduce(MPI_IN_PLACE, assimilation_mask, size(assimilation_mask), &
MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, ierr)
! 4. Count all assimilable grid points
n_valid_points = count(assimilation_mask >= 1) ! note: values > 1 are possible in principle, so use >= 1
if (myid == 0) then
! Debug: mask distribution statistics
write(*,'(A,I8)') 'Debug: Raw valid points (before normalization): ', n_valid_points
write(*,'(A,I8)') 'Debug: Max mask value: ', maxval(assimilation_mask)
write(*,'(A,I8)') 'Debug: Points with mask >= 1: ', count(assimilation_mask >= 1)
write(*,'(A,I8)') 'Debug: Points with mask > 1: ', count(assimilation_mask > 1)
end if
! 5. Normalize the mask to 0/1
where (assimilation_mask >= 1)
assimilation_mask = 1
elsewhere
assimilation_mask = 0
end where
! Recount the valid grid points after normalization
n_valid_points = count(assimilation_mask == 1)
if (myid == 0) then
write(*,'(A)') 'Stage 2: Redistributing valid grid points for load balancing...'
write(*,'(A,I8)') 'Total valid points to distribute: ', n_valid_points
write(*,'(A,I8)') 'Points per process (average): ', n_valid_points / numprocs
write(*,'(A,I8)') 'Last process will handle: ', n_valid_points - (numprocs-1)*(n_valid_points/numprocs)
end if
if (n_valid_points > 0) then
! Build the coordinate list of all valid grid points
total_points = n_valid_points
allocate(all_i_coords(total_points), all_j_coords(total_points))
idx = 0
do j_grid = local_radius+1, ny-local_radius
do i_grid = local_radius+1, nx-local_radius
if (assimilation_mask(i_grid, j_grid) == 1) then
idx = idx + 1
all_i_coords(idx) = i_grid
all_j_coords(idx) = j_grid
end if
end do
end do
if (myid == 0) then
write(*,'(A)') 'Stage 2: Dynamic scheduling (master-worker) for load balancing...'
write(*,'(A,I8)') 'Total valid points to distribute: ', n_valid_points
end if
end if
call MPI_Barrier(MPI_COMM_WORLD, ierr)
! ====== Dynamically compute the task-distribution strategy ======
if (n_valid_points > 0) then
! Choose a reasonable batch size so that every process gets enough work
n_workers = numprocs - 1
if (n_workers > 0) then
! Base batch size: total tasks / (number of workers * target batches per worker)
! The goal is for each worker to process several batches, which keeps the load balanced
target_batches_per_worker = 5 ! target number of batches per worker
tasks_per_worker = max(1, n_valid_points / (n_workers * target_batches_per_worker))
! Keep the batch size within a reasonable range
tasks_per_worker = max(1, min(tasks_per_worker, 100))
if (myid == 0) then ! only rank 0 reports the distribution
write(*,'(A,I8,A,I4,A,I4)') 'Task distribution: ', n_valid_points, ' tasks, ', &
n_workers, ' workers, batch size: ', tasks_per_worker
write(*,'(A,I6)') 'Estimated total batches: ', (n_valid_points + tasks_per_worker - 1) / tasks_per_worker
end if
else
tasks_per_worker = n_valid_points ! single-process case
end if
else
tasks_per_worker = 1
end if
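! Illustrative numbers only (not taken from an actual run): with, say, 50000 valid
! points and 64 workers, the formula gives max(1, 50000/(64*5)) = 156, which is
! capped to 100, i.e. batches of 100 tasks and roughly 500 batches in total.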
print_count = 0
! Stage 3: dynamic-scheduling main loop
! ====== Task-pool mechanism (fully non-blocking, no queuing) ======
allocate(task_ids(tasks_per_worker))
if (n_valid_points == 0) then
if (myid <= 3) then
write(*,'(A,I4,A)') 'Process ', myid, ' assigned 0 grid points, skipping assimilation loop.'
end if
! Important: even with no tasks, each worker must tell the master that it has finished
if (myid /= 0) then
! The worker sends a completion signal so the master knows it has no work to do
call MPI_Send(myid, 1, MPI_INTEGER, 0, TAG_DONE, MPI_COMM_WORLD, ierr)
end if
else
if (myid == 0) then
! ====== Master process: task-pool scheduler ======
n_workers = numprocs - 1
if (n_valid_points == 0) then
! Special case: with no tasks, just wait for TAG_DONE from every worker (defensive; n_valid_points > 0 in this branch, so this path is not normally reached)
write(*,'(A)') 'Master: No tasks to assign, waiting for all workers to report completion...'
do i = 1, n_workers
call MPI_Recv(request, 1, MPI_INTEGER, i, TAG_DONE, MPI_COMM_WORLD, status, ierr)
write(*,'(A,I4,A)') 'Master: received completion signal from worker ', i, ' (no tasks case)'
end do
write(*,'(A)') 'Master: All workers have reported completion (no tasks case)'
else
! Normal case: scheduling logic when there are tasks
allocate(worker_requests(n_workers))
allocate(worker_send_reqs(n_workers))
allocate(worker_status(MPI_STATUS_SIZE, n_workers))
allocate(worker_active(n_workers))
allocate(worker_req_pending(n_workers))
! Initialization: post a non-blocking request receive for each worker
worker_active = .true.
worker_req_pending = .false.
n_active_workers = n_workers
current_task = 1
n_tasks_sent = 0
n_tasks_done = 0
do i = 1, n_workers
call MPI_Irecv(request, 1, MPI_INTEGER, i, TAG_REQUEST, MPI_COMM_WORLD, worker_requests(i), ierr)
worker_req_pending(i) = .true.
end do
write(*,'(A,I4,A)') 'Master: Task pool initialized with ', n_active_workers, ' workers'
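! Message protocol between master and workers, as implemented below:
!   worker -> master : TAG_REQUEST (worker asks for work)
!   master -> worker : TAG_ASSIGN  (a batch of task indices, padded with -1)
!   worker -> master : TAG_DONE    (batch finished; master then reposts TAG_REQUEST)
! A batch whose first entry is -1 tells the worker to terminate.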
! Main loop: task-pool scheduling
do while (current_task <= n_valid_points .or. n_active_workers > 0)
! Check whether any worker has requested a task
do i = 1, n_workers
if (worker_active(i) .and. worker_req_pending(i)) then
call MPI_Test(worker_requests(i), flag, worker_status(:,i), ierr)
if (flag) then
worker_req_pending(i) = .false.
! Assign a batch of tasks
if (current_task <= n_valid_points) then
n_this_batch = min(tasks_per_worker, n_valid_points - current_task + 1)
task_ids(1:n_this_batch) = [(current_task + j - 1, j=1,n_this_batch)]
if (n_this_batch < tasks_per_worker) then
task_ids(n_this_batch+1:tasks_per_worker) = -1
end if
current_task = current_task + n_this_batch
n_tasks_sent = n_tasks_sent + 1
write(*,'(A,I4,A,I4,A,I4)') 'Master: assigning batch ', n_tasks_sent, &
' (', n_this_batch, ' tasks) to worker ', i
else
! No more tasks: send the termination signal
task_ids = -1
worker_active(i) = .false.
n_active_workers = n_active_workers - 1
write(*,'(A,I4,A,I4)') 'Master: terminating worker ', i, &
', active workers: ', n_active_workers
end if
! Non-blocking send of the task batch
call MPI_Isend(task_ids, tasks_per_worker, MPI_INTEGER, i, TAG_ASSIGN, &
MPI_COMM_WORLD, worker_send_reqs(i), ierr)
end if
end if
end do
! Check for worker completion signals
do i = 1, n_workers
if (worker_active(i) .and. .not. worker_req_pending(i)) then
call MPI_Test(worker_send_reqs(i), flag, worker_status(:,i), ierr)
if (flag) then
! Send completed; wait for the worker's completion signal
call MPI_Irecv(request, 1, MPI_INTEGER, i, TAG_DONE, MPI_COMM_WORLD, req, ierr)
call MPI_Wait(req, worker_status(:,i), ierr)
n_tasks_done = n_tasks_done + 1
! Re-post the request receive
if (worker_active(i)) then
call MPI_Irecv(request, 1, MPI_INTEGER, i, TAG_REQUEST, MPI_COMM_WORLD, worker_requests(i), ierr)
worker_req_pending(i) = .true.
end if
end if
end if
end do
! Avoid busy-waiting on a finished pool: exit once every worker has terminated
if (n_active_workers == 0) exit ! all workers have terminated
! Progress report (every time a fixed fraction of the tasks has been dispatched)
if (mod(current_task - 1, max(1, n_valid_points / 20)) == 0 .and. current_task > 1) then
progress_percent = real(current_task - 1) / real(n_valid_points) * 100.0
write(*,'(A,F6.2,A,I6,A,I6,A,I4)') 'Master: progress ', progress_percent, &
'% (', current_task - 1, '/', n_valid_points, '), active workers: ', n_active_workers
end if
end do
write(*,'(A,I4,A,I4)') 'Master: All tasks completed. Sent: ', n_tasks_sent, ', Done: ', n_tasks_done
! Free resources
deallocate(worker_requests, worker_send_reqs, worker_status, worker_active, worker_req_pending)
end if ! end of the if (n_valid_points == 0) / else block
else
! ====== Worker process: continuous work mode ======
if (n_valid_points == 0) then
! With no tasks the worker has nothing to do; it already sent its TAG_DONE signal above
write(*,'(A,I4,A)') 'Worker ', myid, ' has no tasks to process'
else
! Normal work flow when tasks exist
write(*,'(A,I4,A)') 'Worker ', myid, ' starting continuous work mode'
do
! Request work from the master
call MPI_Send(myid, 1, MPI_INTEGER, 0, TAG_REQUEST, MPI_COMM_WORLD, ierr)
! Receive the task batch
call MPI_Recv(task_ids, tasks_per_worker, MPI_INTEGER, 0, TAG_ASSIGN, MPI_COMM_WORLD, status, ierr)
if (task_ids(1) == -1) then
write(*,'(A,I4,A)') 'Worker ', myid, ' received termination signal'
exit
end if
! Process the task batch
do j = 1, tasks_per_worker
if (task_ids(j) == -1) exit
i_grid = all_i_coords(task_ids(j))
j_grid = all_j_coords(task_ids(j))
call DA_MAIN(i_grid, j_grid, &
obs_wind_ptr, obs_locations_ptr, obs_errors_ptr, nobs_actual, &
ensemble_pi_ptr, ensemble_u_ptr, ensemble_v_ptr, ensemble_th_ptr, ensemble_q_ptr, &
back_data_ptr, true_data_ptr, &
analysis_increment_point, &
print_count)
call combine_point_inc(analysis_increment, analysis_increment_point, i_grid, j_grid)
end do
! Send the completion signal
call MPI_Send(myid, 1, MPI_INTEGER, 0, TAG_DONE, MPI_COMM_WORLD, ierr)
! Report progress every few batches (interval scaled by the total number of tasks)
report_interval = max(1, n_valid_points / (tasks_per_worker * 50)) ! roughly 50 reports in total
if (mod(task_ids(1) / tasks_per_worker, report_interval) == 0) then
write(*,'(A,I4,A,I6,A,I6)') 'Worker ', myid, ' processed batch starting at task ', &
task_ids(1), ', batch size: ', count(task_ids /= -1)
end if
end do
write(*,'(A,I4,A)') 'Worker ', myid, ' completed all assigned work'
end if ! end of the if (n_valid_points == 0) / else block
end if
end if
if (allocated(task_ids)) deallocate(task_ids)
! 6. Free memory
if (allocated(assimilation_mask)) deallocate(assimilation_mask)
if (allocated(my_grid_list)) deallocate(my_grid_list)
if (allocated(all_i_coords)) deallocate(all_i_coords)
if (allocated(all_j_coords)) deallocate(all_j_coords)
if (myid == monitor_id) then
write(*,'(A)') strline
write(*,'(A)') 'All stages completed successfully!'
write(*,'(A)') strline
end if
! Wait for global completion
if (myid == 0) then
write(*,'(A)') 'Master: Waiting for all processes to reach final barrier...'
end if
call MPI_Barrier(MPI_COMM_WORLD, ierr)
if (myid == 0) then
write(*,'(A)') 'Master: All processes reached final barrier successfully'
end if
! Print each process's local statistics before the reduction (for debugging)
! if (myid <= 3) then ! 只输出前4个进程的信息
! write(*,'(A,I3,A,I3,A,2ES12.5)') 'Process ', myid, ' (node_rank=', node_rank, ') local π range: ', &
! minval(analysis_increment%pi), maxval(analysis_increment%pi)
! write(*,'(A,I3,A,I3,A,2ES12.5)') 'Process ', myid, ' (node_rank=', node_rank, ') local u range: ', &
! minval(analysis_increment%u), maxval(analysis_increment%u)
! end if
! Global reduction of analysis_increment - with debug output
! if (myid == 0) then
! write(*,'(A)') 'Starting global reduction of analysis increment...'
! write(*,'(A,2ES12.5)') 'Before reduction, process 0 π range: ', &
! minval(analysis_increment%pi), maxval(analysis_increment%pi)
! end if
call global_reduce_analysis_increment(analysis_increment, fillvalue)
if (myid == 0) then
! write(*,'(A)') 'Computing final analysis field ...'
! Print analysis-field statistics
write(*,'(A)') strline
write(*,'(A)') ' Global Analysis Increment Statistics'
write(*,'(A)') strline
write(*,'(A,2ES12.5)') ' π increment range: ', minval(analysis_increment%pi), maxval(analysis_increment%pi)
write(*,'(A,2ES12.5)') ' u increment range: ', minval(analysis_increment%u), maxval(analysis_increment%u)
write(*,'(A,2ES12.5)') ' v increment range: ', minval(analysis_increment%v), maxval(analysis_increment%v)
write(*,'(A,2ES12.5)') ' θ increment range: ', minval(analysis_increment%th), maxval(analysis_increment%th)
write(*,'(A,2ES12.5)') ' q increment range: ', minval(analysis_increment%q), maxval(analysis_increment%q)
write(*,'(A)') strline
write(*,'(A)') 'Writing analysis increment to file ...'
call write_analysis_increment('da_inc.dat', analysis_increment, height_full, height_half)
write(*,'(A)') 'Writing combined analysis increment to GRAPES input format ...'
call write_combine_inc2ana('grapesinput', 'grapesinput_DA', back_data_ptr, analysis_increment, height_full, height_half)
write(*,'(A)') strline
write(*,'(A)') ' Data Assimilation: COMPLETED'
write(*,'(A)') strline
! Force all processes to terminate
write(*,'(A)') 'Master: Analysis completed successfully, forcing program termination...'
call MPI_Abort(MPI_COMM_WORLD, 0, ierr) ! terminate all MPI processes immediately
end if
! Free the shared memory
if (node_rank == 0) then
call MPI_Win_free(win_obs, ierr)
call MPI_Win_free(win_ensemble_pi, ierr)
call MPI_Win_free(win_ensemble_u, ierr)
call MPI_Win_free(win_ensemble_v, ierr)
call MPI_Win_free(win_ensemble_th, ierr)
call MPI_Win_free(win_ensemble_q, ierr)
call MPI_Win_free(win_back, ierr)
call MPI_Win_free(win_true, ierr)
end if
! Free the communicators; called by all processes
call MPI_Comm_free(node_comm, ierr)
if (node_rank == 0 .and. inter_comm /= MPI_COMM_NULL) then
call MPI_Comm_free(inter_comm, ierr)
end if
! All processes prepare to exit
if (myid == 0) then
write(*,'(A)') 'Master: All processes preparing to finalize MPI...'
end if
! Finalize the MPI environment; all processes must call this
call MPI_FINALIZE(ierr)
if (myid == 0) then
write(*,'(A)') 'Master: MPI finalized successfully, program should exit now'
end if
contains
! Compute the 2-D process grid decomposition
subroutine compute_process_grid(numprocs, nx_eff, ny_eff, px, py)
integer, intent(in) :: numprocs, nx_eff, ny_eff
integer, intent(out) :: px, py
integer :: i, best_px, best_py
real :: ratio, grid_ratio, best_ratio_diff
grid_ratio = real(nx_eff) / real(ny_eff)
best_ratio_diff = huge(1.0)
best_px = 1
best_py = numprocs
! Find the factorization px*py = numprocs whose aspect ratio px/py is closest to nx_eff/ny_eff (only divisors up to sqrt(numprocs)+1 are tried as px)
do i = 1, int(sqrt(real(numprocs))) + 1
if (mod(numprocs, i) == 0) then
ratio = real(i) / real(numprocs / i)
if (abs(ratio - grid_ratio) < best_ratio_diff) then
best_ratio_diff = abs(ratio - grid_ratio)
best_px = i
best_py = numprocs / i
end if
end if
end do
px = best_px
py = best_py
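! Illustrative example (assumed numbers): for numprocs = 16 and nx_eff/ny_eff of
! about 1.6, the divisors tried as px are 1, 2 and 4, giving candidate grids
! (1,16), (2,8) and (4,4); (4,4) has the aspect ratio closest to 1.6, so
! px = py = 4 is returned.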
end subroutine compute_process_grid
! Compute the local sub-domain owned by this process
subroutine compute_local_domain(px_rank, py_rank, px, py, nx_eff, ny_eff, &
istart, iend, jstart, jend)
integer, intent(in) :: px_rank, py_rank, px, py, nx_eff, ny_eff
integer, intent(out) :: istart, iend, jstart, jend
integer :: ilen, jlen, irem, jrem
! Base number of grid cells per process
ilen = nx_eff / px
jlen = ny_eff / py
! Remainders
irem = mod(nx_eff, px)
jrem = mod(ny_eff, py)
! Range in the x direction
if (px_rank < irem) then
istart = px_rank * (ilen + 1) + 1
iend = istart + ilen
else
istart = px_rank * ilen + irem + 1
iend = istart + ilen - 1
end if
! Range in the y direction
if (py_rank < jrem) then
jstart = py_rank * (jlen + 1) + 1
jend = jstart + jlen
else
jstart = py_rank * jlen + jrem + 1
jend = jstart + jlen - 1
end if
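! Worked example of the partition above: nx_eff = 10 split over px = 3 gives
! ilen = 3 and irem = 1, so rank 0 owns i = 1..4 (it absorbs the extra cell) and
! ranks 1 and 2 own i = 5..7 and i = 8..10 respectively.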
end subroutine compute_local_domain
! Set up the shared memory for the observation data
subroutine setup_shared_memory_obs(node_comm, node_rank, obs_wind_ptr, &
obs_locations_ptr, obs_errors_ptr, &
obs_weight_ptr, nobs_actual, win_obs)
integer, intent(in) :: node_comm, node_rank, nobs_actual
real, pointer, intent(inout) :: obs_wind_ptr(:)
real, pointer, intent(inout) :: obs_locations_ptr(:,:)
real, pointer, intent(inout) :: obs_errors_ptr(:)
real, pointer, intent(inout) :: obs_weight_ptr(:)
integer, intent(out) :: win_obs
integer(kind=MPI_ADDRESS_KIND) :: ssize
integer :: ierr, disp_unit
type(c_ptr) :: baseptr
real, pointer :: shared_array(:)
if (node_rank == 0) then
ssize = int(nobs_actual * 6, MPI_ADDRESS_KIND) ! obs_wind + obs_locations(3) + obs_errors + obs_weight = 6*nobs_actual
else
ssize = 0
end if
disp_unit = 4 ! sizeof(real)
call MPI_Win_allocate_shared(ssize * disp_unit, disp_unit, MPI_INFO_NULL, &
node_comm, baseptr, win_obs, ierr)
if (node_rank /= 0) then
call MPI_Win_shared_query(win_obs, 0, ssize, disp_unit, baseptr, ierr)
end if
! Convert the C pointer to a Fortran pointer
call c_f_pointer(baseptr, shared_array, [nobs_actual * 6])
! Map the individual arrays onto the shared buffer
obs_wind_ptr => shared_array(1:nobs_actual)
obs_errors_ptr => shared_array(nobs_actual+1:2*nobs_actual)
obs_weight_ptr => shared_array(2*nobs_actual+1:3*nobs_actual)
! Map the 2-D array obs_locations_ptr(nobs_actual, 3) correctly:
! use c_loc to get the starting address, then map it as a 2-D array.
! obs_locations occupies shared_array(3*nobs_actual+1 : 6*nobs_actual)
call c_f_pointer(c_loc(shared_array(3*nobs_actual+1)), obs_locations_ptr, [nobs_actual, 3])
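! Resulting layout of the shared buffer (6*nobs_actual reals in total):
!   [              1 ..   nobs_actual] obs_wind
!   [  nobs_actual+1 .. 2*nobs_actual] obs_errors
!   [2*nobs_actual+1 .. 3*nobs_actual] obs_weight
!   [3*nobs_actual+1 .. 6*nobs_actual] obs_locations(nobs_actual, 3)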
end subroutine setup_shared_memory_obs
! Set up the shared memory for the large arrays - split into 5 variables
subroutine setup_shared_memory_data(node_comm, node_rank, ensemble_pi_ptr, &
ensemble_u_ptr, ensemble_v_ptr, ensemble_th_ptr, ensemble_q_ptr, &
back_data_ptr, true_data_ptr, &
win_ensemble_pi, win_ensemble_u, win_ensemble_v, &
win_ensemble_th, win_ensemble_q, win_back, win_true)
integer, intent(in) :: node_comm, node_rank
real, pointer, intent(out) :: ensemble_pi_ptr(:,:,:,:), ensemble_u_ptr(:,:,:,:), ensemble_v_ptr(:,:,:,:), &
ensemble_th_ptr(:,:,:,:), ensemble_q_ptr(:,:,:,:)
type(model_data), pointer, intent(out) :: back_data_ptr, true_data_ptr
integer, intent(out) :: win_ensemble_pi, win_ensemble_u, win_ensemble_v, win_ensemble_th, win_ensemble_q, win_back, win_true
integer(kind=MPI_ADDRESS_KIND) :: ssize_ensemble, ssize_data
integer :: ierr, disp_unit
type(c_ptr) :: baseptr_pi, baseptr_u, baseptr_v, baseptr_th, baseptr_q, baseptr_back, baseptr_true
disp_unit = 4 ! sizeof(real)
! Ensemble data sizes - each variable gets its own allocation
if (node_rank == 0) then
! Do the products in MPI_ADDRESS_KIND so they cannot overflow a default integer on large grids
ssize_ensemble = int(nx, MPI_ADDRESS_KIND) * int(nz, MPI_ADDRESS_KIND) * &
int(ny, MPI_ADDRESS_KIND) * int(nsample, MPI_ADDRESS_KIND) ! one ensemble variable
ssize_data = int(nx, MPI_ADDRESS_KIND) * int(nz, MPI_ADDRESS_KIND) * &
int(ny, MPI_ADDRESS_KIND) * 5 ! 5 model variables
else
ssize_ensemble = 0
ssize_data = 0
end if
! Allocate shared memory for the 5 ensemble variables
call MPI_Win_allocate_shared(ssize_ensemble * disp_unit, disp_unit, MPI_INFO_NULL, &
node_comm, baseptr_pi, win_ensemble_pi, ierr)
if (node_rank /= 0) then
call MPI_Win_shared_query(win_ensemble_pi, 0, ssize_ensemble, disp_unit, baseptr_pi, ierr)
end if
call c_f_pointer(baseptr_pi, ensemble_pi_ptr, [nx, nz, ny, nsample])
call MPI_Win_allocate_shared(ssize_ensemble * disp_unit, disp_unit, MPI_INFO_NULL, &
node_comm, baseptr_u, win_ensemble_u, ierr)
if (node_rank /= 0) then
call MPI_Win_shared_query(win_ensemble_u, 0, ssize_ensemble, disp_unit, baseptr_u, ierr)
end if
call c_f_pointer(baseptr_u, ensemble_u_ptr, [nx, nz, ny, nsample])
call MPI_Win_allocate_shared(ssize_ensemble * disp_unit, disp_unit, MPI_INFO_NULL, &
node_comm, baseptr_v, win_ensemble_v, ierr)
if (node_rank /= 0) then
call MPI_Win_shared_query(win_ensemble_v, 0, ssize_ensemble, disp_unit, baseptr_v, ierr)
end if
call c_f_pointer(baseptr_v, ensemble_v_ptr, [nx, nz, ny, nsample])
call MPI_Win_allocate_shared(ssize_ensemble * disp_unit, disp_unit, MPI_INFO_NULL, &
node_comm, baseptr_th, win_ensemble_th, ierr)
if (node_rank /= 0) then
call MPI_Win_shared_query(win_ensemble_th, 0, ssize_ensemble, disp_unit, baseptr_th, ierr)
end if
call c_f_pointer(baseptr_th, ensemble_th_ptr, [nx, nz, ny, nsample])
call MPI_Win_allocate_shared(ssize_ensemble * disp_unit, disp_unit, MPI_INFO_NULL, &
node_comm, baseptr_q, win_ensemble_q, ierr)
if (node_rank /= 0) then
call MPI_Win_shared_query(win_ensemble_q, 0, ssize_ensemble, disp_unit, baseptr_q, ierr)
end if
call c_f_pointer(baseptr_q, ensemble_q_ptr, [nx, nz, ny, nsample])
! Allocate shared memory for the background field
call MPI_Win_allocate_shared(ssize_data * disp_unit, disp_unit, MPI_INFO_NULL, &
node_comm, baseptr_back, win_back, ierr)
if (node_rank /= 0) then
call MPI_Win_shared_query(win_back, 0, ssize_data, disp_unit, baseptr_back, ierr)
end if
call c_f_pointer(baseptr_back, back_data_ptr)
! Allocate shared memory for the truth field
call MPI_Win_allocate_shared(ssize_data * disp_unit, disp_unit, MPI_INFO_NULL, &
node_comm, baseptr_true, win_true, ierr)
if (node_rank /= 0) then
call MPI_Win_shared_query(win_true, 0, ssize_data, disp_unit, baseptr_true, ierr)
end if
call c_f_pointer(baseptr_true, true_data_ptr)
end subroutine setup_shared_memory_data
! The node root process reads the data and stores it in shared memory
subroutine read_and_store_shared_data(ensemble_pi_ptr, ensemble_u_ptr, &
ensemble_v_ptr, ensemble_th_ptr, ensemble_q_ptr, &
back_data_ptr, true_data_ptr, obs_wind_ptr, &
obs_locations_ptr, obs_errors_ptr, &
obs_weight_ptr, nobs_actual, &
height_full, height_half, gen_obs_loc)
use module_svd
use module_io
real, pointer, intent(inout) :: ensemble_pi_ptr(:,:,:,:), ensemble_u_ptr(:,:,:,:), &
ensemble_v_ptr(:,:,:,:), ensemble_th_ptr(:,:,:,:), ensemble_q_ptr(:,:,:,:)
type(model_data), pointer, intent(inout) :: back_data_ptr, true_data_ptr
real, pointer, intent(inout) :: obs_wind_ptr(:), obs_errors_ptr(:), &
obs_locations_ptr(:,:), obs_weight_ptr(:)
integer, intent(in) :: nobs_actual
real, dimension(ids:ide, kms:kme, jds:jde), intent(in) :: height_full, height_half
logical, intent(in), optional :: gen_obs_loc ! whether to generate observation locations
integer :: k, ierr
type(model_ensemble_data) :: temp_ensemble_data ! temporary ensemble data structure
logical :: do_gen_obs_loc
if (present(gen_obs_loc)) then
do_gen_obs_loc = gen_obs_loc
else
do_gen_obs_loc = .true.
end if
write(*,'(A)') 'Reading ensemble forecasts ...'
! Allocate the temporary structure first
allocate(temp_ensemble_data%pi(ids:ide, kms:kme, jds:jde, nsample))
allocate(temp_ensemble_data%u(ids:ide, kms:kme, jds:jde, nsample))
allocate(temp_ensemble_data%v(ids:ide, kms:kme, jds:jde, nsample))
allocate(temp_ensemble_data%th(ids:ide, kms:kme, jds:jde, nsample))
allocate(temp_ensemble_data%q(ids:ide, kms:kme, jds:jde, nsample))
call read_ensemble_forecasts(temp_ensemble_data, height_full, height_half)
! Copy the data into the shared-memory pointers
ensemble_pi_ptr = temp_ensemble_data%pi
ensemble_u_ptr = temp_ensemble_data%u
ensemble_v_ptr = temp_ensemble_data%v
ensemble_th_ptr = temp_ensemble_data%th
ensemble_q_ptr = temp_ensemble_data%q
! Free the temporary structure
deallocate(temp_ensemble_data%pi, temp_ensemble_data%u, temp_ensemble_data%v, &
temp_ensemble_data%th, temp_ensemble_data%q)
write(*,'(A)') 'Ensemble data: OK'
write(*,'(A)') 'Reading background field ...'
call read_background_field(back_data_ptr, height_full, height_half)
write(*,'(A)') 'Background field: OK'
write(*,'(A)') 'Reading true field ...'
call read_truth_field(true_data_ptr, height_full, height_half)
write(*,'(A)') 'True field: OK'
! Observation processing
write(*,'(A,I0,A)') 'Processing ', nobs_actual, ' observations ...'
! The observation locations were already generated by the global master and broadcast; do not regenerate them here
write(*,'(A)') 'Step 1: Observation locations already generated by global master.'
write(*,'(A)') 'Step 2: Initializing observation arrays ...'
obs_wind_ptr(:) = 0.0 ! initialize
obs_errors_ptr(:) = 1.005 ! observation error
write(*,'(A)') 'Step 2: Observation arrays initialized.'
write(*,'(A)') 'Step 3: Calling get_obs_data ...'
call get_obs_data(nobs_actual, obs_wind_ptr, obs_locations_ptr, obs_errors_ptr, true_data_ptr%u, obs_weight_ptr)
write(*,'(A)') 'Step 3: get_obs_data completed.'
end subroutine read_and_store_shared_data
! Global reduction of the analysis increment - reduce to the master process only
subroutine global_reduce_analysis_increment(analysis_increment, fillvalue)
use mpi
type(model_data), intent(inout) :: analysis_increment
integer :: ierr
integer :: array_size
type(model_data) :: global_result ! global reduction result
real, intent(in) :: fillvalue
! Array size: (ide-ids+1)*(kme-kms+1)*(jde-jds+1)
array_size = (ide-ids+1)*(kme-kms+1)*(jde-jds+1)
! Debug: statistics before the reduction
if (myid == 0) then
write(*,'(A,2ES12.5)') 'Process 0 before MPI_Reduce π range: ', &
minval(analysis_increment%pi), maxval(analysis_increment%pi)
end if
! Reduce to the master process (myid == 0) only; no broadcast afterwards
! Note: MPI_MIN is used here to select the valid analysis increment (avoids summing overlaps)
! Strategy: only grid points that were actually processed carry real values; all others keep fillvalue = 999999
call MPI_Reduce(analysis_increment%pi, global_result%pi, &
array_size, MPI_REAL, MPI_MIN, 0, MPI_COMM_WORLD, ierr)
call MPI_Reduce(analysis_increment%u, global_result%u, &
array_size, MPI_REAL, MPI_MIN, 0, MPI_COMM_WORLD, ierr)
call MPI_Reduce(analysis_increment%v, global_result%v, &
array_size, MPI_REAL, MPI_MIN, 0, MPI_COMM_WORLD, ierr)
call MPI_Reduce(analysis_increment%th, global_result%th, &
array_size, MPI_REAL, MPI_MIN, 0, MPI_COMM_WORLD, ierr)
call MPI_Reduce(analysis_increment%q, global_result%q, &
array_size, MPI_REAL, MPI_MIN, 0, MPI_COMM_WORLD, ierr)
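! How the MPI_MIN selection works: a grid point processed by exactly one rank
! carries its real increment there and fillvalue (999999.0) everywhere else, so
! the minimum is the real value; points processed by no rank stay at fillvalue
! and are zeroed out below.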
! Only the master process needs to update the result
if (myid == 0) then
where (global_result%pi == fillvalue)
global_result%pi = 0.0
end where
where (global_result%u == fillvalue)
global_result%u = 0.0
end where
where (global_result%v == fillvalue)
global_result%v = 0.0
end where
where (global_result%th == fillvalue)
global_result%th = 0.0
end where
where (global_result%q == fillvalue)
global_result%q = 0.0
end where
analysis_increment%pi = global_result%pi
analysis_increment%u = global_result%u
analysis_increment%v = global_result%v
analysis_increment%th = global_result%th
analysis_increment%q = global_result%q
! Debug: statistics after the reduction
! write(*,'(A,2ES12.5)') 'After MPI_Reduce π range: ', &
! minval(analysis_increment%pi), maxval(analysis_increment%pi)
! write(*,'(A,2ES12.5)') 'After MPI_Reduce u range: ', &
! minval(analysis_increment%u), maxval(analysis_increment%u)
end if
end subroutine global_reduce_analysis_increment
end program da_system