map-combine过程解

本文详细介绍了MapReduce中的Combiner组件的工作原理及其执行时机。解释了Combiner如何在Map阶段优化数据处理流程,减少磁盘I/O操作,并提供了一个具体的代码示例来说明其使用方式。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

●read阶段: 通过RecordReader从InputSplit分片中将数据解析成一个个key/value。 
          ●map阶段: 将由RecordReader解析出的key/value交给map()方法处理,并生成一个个新的key/value。 
          ●collect阶段: 将map()中新生成的key/value由OutputCollector.collect()写入内存中的环形数据缓冲区。 
          ●spill阶段: 当环形缓冲区达到一定阈值后,会将数据写到本地磁盘上,生成一个spill文件。在写文件之前,会先将数据进行一次本地排序,必要的时候(按配置要求)还会对数据进行压缩。 

         ●combine阶段:当所有数据处理完后,将所有临时的spill文件进行一次合并,最终只生成一个数据文件。 -------任务少、数据量小时如此,其它情况不是;

最后,我们再来看一下Combiner的执行时机。我们之前已对map端的shuffle做过比较深入的了解,详情请看MapTask详解。那么,Combiner会在map端的哪个时期执行呢?实际上,Combiner函数的执行时机可能会在map的merge操作完成之前,也可能在merge之后执行,这个时机由配置参数min.num.spills.for.combine(该值默认为3)决定,也就是说在map端产生的spill文件最少有min.num.spills.for.combine个的时候,Combiner函数会在merge操作合并最终的本机结果文件之前执行,否则在merge之后执行。通过这种方式,就可以在spill文件很多并且需要做combine的时候,减少写入本地磁盘的数据量,同样也减少了对磁盘的读写频率,可以起到优化作业的目的。

eg:

MAP:

public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{
String valueStr = value.toString();
String[] fields = valueStr.trim().split(",");
            context.write(new Text(fields[4].substring(4)),new Text(fields[37]));
}


COMBINE:

// NOTE: deliberately left commented out as a counter-example of a broken combiner.
// It reads every incoming value with Float.parseFloat(...) but writes values of the
// form "<count>,<sum>". If the combiner is ever fed its own output, that string
// cannot be parsed as a float. A combiner must emit values in the same format as
// the values it consumes.
// public static class combine extends Reducer<Text,Text,Text,Text>{
// public void reduce(Text key,Iterable<Text> values,Context context) throws IOException, InterruptedException{
// int num = 0;
// float moneyCount = 0;
// for(Text value : values){
// num=num+1;
// moneyCount=Float.parseFloat(value.toString())+moneyCount;
// }
// context.write(key, new Text(String.valueOf(num)+","+String.valueOf(moneyCount)));
// }
// }

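因为并不是等所有的map都结束之后才执行一次combine,而是会递归地多次执行(某次combine的输出有可能和map的输出一起再次进入combine),所以combine输出的key/value格式必须和map输出的格式完全一致。上述代码中,map输出的value是单个字段,而combine输出的value是"num,moneyCount"形式的字符串,再次进入combine时Float.parseFloat会解析失败,所以上述代码有误。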



# Threat-aware route planning demo.
#
# Builds a gridded threat map (terrain, radar, fire) and plans a turn-constrained
# route over it with an anytime weighted A* (ARA*-style) search.
#
# Problem reported against the previous revision of this script: the generated
# track was always a straight line and the reported search time was always 0.
# Causes addressed in this revision:
#   * the terrain threat was computed from a single cell, so it was always 0;
#   * the threat level never entered the path cost, only the heuristic;
#   * the turn-angle test compared a rounded acos() result against the limit,
#     so an exact 45 degree turn could be rejected by floating-point noise;
#   * the search ran a single weighted A* pass (epsilon was never decreased);
#   * elapsed time used time.time() and was printed with two decimals.
import heapq
import math
import time

import numpy as np

try:  # plotting is optional so that the planner can run headless
    import matplotlib.pyplot as plt
except ImportError:
    plt = None

try:  # not used below; import kept in case other code relies on the name
    from tqdm import tqdm
except ImportError:
    tqdm = None

# Slack applied when comparing the cosine of a turn against the configured limit.
_ANGLE_TOLERANCE = 1e-9


def _pyplot():
    """Return ``matplotlib.pyplot`` or raise ImportError when it is unavailable."""
    if plt is None:
        raise ImportError("matplotlib is required for plotting")
    return plt


def _draw_threat_sites(pyplot, radars, fires):
    """Scatter radar and fire sites, labelling each kind once for the legend."""
    for index, radar in enumerate(radars):
        x, y = radar['pos']
        pyplot.scatter(x, y, s=100, c='blue', marker='^',
                       label='Radar' if index == 0 else None)
    for index, fire in enumerate(fires):
        x, y = fire['pos']
        pyplot.scatter(x, y, s=100, c='red', marker='o',
                       label='Fire' if index == 0 else None)


class ThreatMap:
    """Grid of terrain, radar and fire threat levels plus their weighted fusion.

    Arrays are indexed ``[x, y]`` with shape ``(width, height)``.
    """

    def __init__(self, width, height, resolution=100):
        self.width = width
        self.height = height
        self.resolution = resolution  # metres per grid cell
        self.terrain = np.zeros((width, height))
        self.radars = []
        self.fires = []
        self.threat_map = np.zeros((width, height, 3))  # channels: terrain, radar, fire
        self.combined_threat = np.zeros((width, height))
        self.weights = np.array([0.2, 0.2, 0.3])  # base weights [wg, wR, wAA]

    def generate_terrain(self, terrain_type='mountain'):
        """Fill ``self.terrain`` with synthetic elevations scaled to 0..1000.

        ``terrain_type`` is ``'mountain'`` or ``'valley'``; any other value
        leaves the current terrain shape and only applies the normalisation.
        """
        if terrain_type == 'mountain':
            x = np.linspace(0, 10, self.width)
            y = np.linspace(0, 10, self.height)
            X, Y = np.meshgrid(x, y)
            # meshgrid yields (height, width); transpose to the [x, y] layout.
            self.terrain = (500 * np.sin(0.5 * X) * np.cos(0.5 * Y) + 300).T
        elif terrain_type == 'valley':
            i = np.arange(self.width)[:, np.newaxis]
            j = np.arange(self.height)[np.newaxis, :]
            self.terrain = 100 + 50 * np.sin(i / 20) + 50 * np.cos(j / 20)

        lowest = np.min(self.terrain)
        span = np.max(self.terrain) - lowest
        if span > 0:
            self.terrain = (self.terrain - lowest) / span * 1000
        else:
            # Flat terrain: avoid 0/0, which used to turn the whole map into NaN.
            self.terrain = np.zeros_like(self.terrain, dtype=float)

    def add_radar(self, x, y, direction, max_range, fov=120):
        """Register a radar at grid cell ``(x, y)``.

        ``direction`` and ``fov`` are in degrees, ``max_range`` in metres.
        """
        self.radars.append({
            'pos': (x, y),
            'direction': direction,
            'max_range': max_range,
            'fov': fov,
        })

    def add_fire(self, x, y, min_range, max_range):
        """Register a fire unit at grid cell ``(x, y)`` with its range band in metres."""
        self.fires.append({
            'pos': (x, y),
            'min_range': min_range,
            'max_range': max_range,
        })

    def calculate_terrain_threat(self, x, y, z):
        """Return the terrain threat in [0, 1] for altitude ``z`` above cell ``(x, y)``.

        The threat is the local relief (max minus min elevation in the 3x3
        neighbourhood, clipped at the map border) divided by the clearance
        above the highest neighbouring cell.  It saturates at 1 and is 1
        whenever ``z`` is not above that cell.
        """
        rows, cols = self.terrain.shape
        window = self.terrain[max(x - 1, 0):min(x + 2, rows),
                              max(y - 1, 0):min(y + 2, cols)]
        h_max = float(np.max(window))
        h_min = float(np.min(window))
        clearance = z - h_max
        if clearance <= 0:
            return 1.0
        return min((h_max - h_min) / clearance, 1.0)

    def calculate_radar_threat(self, x, y, z):
        """Return the summed radar threat at cell ``(x, y)``; ``z`` is not used."""
        threat = 0.0
        for radar in self.radars:
            rx, ry = radar['pos']
            dx = (x - rx) * self.resolution
            dy = (y - ry) * self.resolution
            distance = math.hypot(dx, dy)
            max_range = radar['max_range']
            if distance >= max_range:
                continue
            bearing = math.degrees(math.atan2(dy, dx))
            # Smallest absolute difference between bearing and boresight, 0..180.
            off_axis = abs((bearing - radar['direction'] + 180) % 360 - 180)
            if off_axis > radar['fov'] / 2:
                continue
            threat += math.exp(-10 * distance / max_range)
        return threat

    def calculate_fire_threat(self, x, y):
        """Return the summed fire threat at cell ``(x, y)``."""
        threat = 0.0
        for fire in self.fires:
            fx, fy = fire['pos']
            dx = (x - fx) * self.resolution
            dy = (y - fy) * self.resolution
            distance = math.hypot(dx, dy)
            min_range = fire['min_range']
            max_range = fire['max_range']
            if distance < min_range or distance > max_range:
                continue
            band = max_range - min_range
            if band <= 0:
                # Zero-width band: the cell sits exactly on it; avoid 0/0.
                threat += 1.0
            else:
                threat += math.exp(-10 * (distance - min_range) / band)
        return threat

    def calculate_threats(self, z=500):
        """Fill the three threat channels for a flight altitude of ``z``."""
        for i in range(self.width):
            for j in range(self.height):
                self.threat_map[i, j, 0] = self.calculate_terrain_threat(i, j, z)
                self.threat_map[i, j, 1] = self.calculate_radar_threat(i, j, z)
                self.threat_map[i, j, 2] = self.calculate_fire_threat(i, j)

    def variable_weighting(self, T):
        """Fuse the threat triple ``T`` = [terrain, radar, fire] into one value.

        The base weights are rescaled per cell by a state factor and then
        renormalised so that they sum to 1.
        """
        T = np.asarray(T, dtype=float)
        S = np.array([
            1 + np.exp(T[0]),
            1 + 1 / (T[1] + 1),
            1 + 1 / (T[2] + 1),
        ])
        scaled = self.weights * S
        W_prime = scaled / np.sum(scaled)
        return np.sum(W_prime * T)

    def combine_threats(self):
        """Fill ``self.combined_threat`` from the three threat channels."""
        for i in range(self.width):
            for j in range(self.height):
                self.combined_threat[i, j] = self.variable_weighting(self.threat_map[i, j])

    def plot_threat_map(self):
        """Show the combined threat map with radar and fire sites."""
        pyplot = _pyplot()
        pyplot.figure(figsize=(12, 10))
        pyplot.imshow(self.combined_threat.T, origin='lower', cmap='hot')
        _draw_threat_sites(pyplot, self.radars, self.fires)
        pyplot.colorbar(label='Threat Level')
        pyplot.title('Combined Threat Map')
        pyplot.xlabel('X (grid cells)')
        pyplot.ylabel('Y (grid cells)')
        pyplot.legend()
        pyplot.show()


class ImprovedARAStar:
    """Anytime weighted A* over a ThreatMap with a maximum turn angle.

    Parameters:
        threat_map: object exposing ``width``, ``height``, ``resolution``
            and a ``combined_threat`` array indexed ``[x, y]``.
        max_turn_angle: largest allowed heading change per step, in degrees.
        threat_weight: multiplier that converts threat level into extra path
            cost; 0 makes the planner ignore threats.
        epsilon_step: amount by which the heuristic inflation is reduced
            between successive refinements.
    """

    def __init__(self, threat_map, max_turn_angle=45, threat_weight=10.0, epsilon_step=0.5):
        self.threat_map = threat_map
        self.width = threat_map.width
        self.height = threat_map.height
        self.resolution = threat_map.resolution
        self.max_turn_angle = max_turn_angle
        self.k = 0.2  # threat-dependent inflation of the heuristic
        self.epsilon = 2.0  # initial heuristic inflation
        self.threat_weight = threat_weight
        self.epsilon_step = epsilon_step

    def heuristic(self, a, b):
        """Return the straight-line distance between cells ``a`` and ``b`` in metres."""
        return math.sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) * self.resolution

    def get_neighbors(self, node, parent):
        """Return the in-bounds 8-connected neighbours of ``node``.

        When ``parent`` is given, neighbours that would require a heading
        change larger than ``max_turn_angle`` are left out.
        """
        heading = None
        if parent is not None:
            hx, hy = node[0] - parent[0], node[1] - parent[1]
            if hx or hy:
                heading = (hx, hy, math.hypot(hx, hy))

        # Compare cosines with a small tolerance instead of comparing acos()
        # output, which can land a hair above the limit for an exact 45 degree turn.
        limit = min(max(self.max_turn_angle, 0), 180)
        min_cos = math.cos(math.radians(limit)) - _ANGLE_TOLERANCE

        neighbors = []
        for dx in (-1, 0, 1):
            for dy in (-1, 0, 1):
                if dx == 0 and dy == 0:
                    continue
                x, y = node[0] + dx, node[1] + dy
                if not (0 <= x < self.width and 0 <= y < self.height):
                    continue
                if heading is not None:
                    hx, hy, h_norm = heading
                    cos_turn = (hx * dx + hy * dy) / (h_norm * math.hypot(dx, dy))
                    if cos_turn < min_cos:
                        continue
                neighbors.append((x, y))
        return neighbors

    def path_cost(self, a, b):
        """Return the geometric distance between cells ``a`` and ``b`` in metres."""
        dx = (a[0] - b[0]) * self.resolution
        dy = (a[1] - b[1]) * self.resolution
        return math.hypot(dx, dy)

    def _threat_at(self, cell):
        """Return the combined threat at ``cell``, floored at 0."""
        return max(float(self.threat_map.combined_threat[cell[0], cell[1]]), 0.0)

    def _edge_cost(self, a, b):
        """Return the distance from ``a`` to ``b`` scaled up by the threat at ``b``."""
        return self.path_cost(a, b) * (1 + self.threat_weight * self._threat_at(b))

    def plan(self, start, goal, max_time=60):
        """Search for a route from ``start`` to ``goal`` (grid-cell tuples).

        The search starts with the heuristic inflated by ``self.epsilon`` and
        then tightens it in steps of ``self.epsilon_step`` down to 1, reusing
        earlier work, until done or until ``max_time`` seconds have passed.

        Returns ``(path, path_length, threat_value, elapsed)``: the list of
        cells (empty when no route was found), its geometric length in
        metres, the mean combined threat over its cells, and the elapsed
        wall-clock seconds.
        """
        clock = time.perf_counter  # time.time() can be too coarse for fast searches
        started = clock()
        deadline = started + max_time
        inf = float('inf')

        # A search state is (cell, previous cell): the heading is part of the
        # state so that re-parenting can never break the turn limit downstream.
        # The origin uses itself as "previous" to mean "no heading yet".
        origin = (start, start)
        g_score = {origin: 0.0}
        came_from = {origin: None}
        best_goal = origin if start == goal else None

        def priority(state, eps):
            cell = state[0]
            eps_s = eps * (1 + self.k * self._threat_at(cell))
            return g_score[state] + eps_s * self.heuristic(cell, goal)

        eps = self.epsilon
        open_f = {origin: priority(origin, eps)}  # state -> its current key
        heap = [(open_f[origin], origin)]
        path = []

        while True:
            closed = set()
            incons = set()
            timed_out = False

            while heap:
                if clock() >= deadline:
                    timed_out = True
                    break
                f, state = heap[0]
                if open_f.get(state) != f:
                    heapq.heappop(heap)  # stale entry superseded by a better key
                    continue
                goal_g = g_score[best_goal] if best_goal is not None else inf
                if goal_g <= f:
                    break  # nothing left in OPEN can improve the current route
                heapq.heappop(heap)
                del open_f[state]
                closed.add(state)

                cell, prev = state
                for nb in self.get_neighbors(cell, None if prev == cell else prev):
                    succ = (nb, cell)
                    candidate = g_score[state] + self._edge_cost(cell, nb)
                    if candidate >= g_score.get(succ, inf):
                        continue
                    g_score[succ] = candidate
                    came_from[succ] = state
                    if nb == goal and (best_goal is None or candidate < g_score[best_goal]):
                        best_goal = succ
                    if succ in closed:
                        incons.add(succ)  # re-opened in the next refinement
                    else:
                        key = priority(succ, eps)
                        open_f[succ] = key
                        heapq.heappush(heap, (key, succ))

            if best_goal is not None:
                path = []
                state = best_goal
                while state is not None:
                    path.append(state[0])
                    state = came_from[state]
                path.reverse()

            if timed_out or best_goal is None or eps <= 1.0:
                break

            # Tighten the inflation and re-key OPEN together with INCONS.
            step = self.epsilon_step
            eps = max(1.0, eps - step) if step > 0 else 1.0
            pending = set(open_f) | incons
            open_f = {state: priority(state, eps) for state in pending}
            heap = [(key, state) for state, key in open_f.items()]
            heapq.heapify(heap)

        path_length = sum((self.path_cost(a, b) for a, b in zip(path, path[1:])), 0.0)
        if path:
            threat_value = sum(self._threat_at(cell) for cell in path) / len(path)
        else:
            threat_value = 0.0
        return path, path_length, threat_value, clock() - started

    def plot_path(self, path, title="Path Planning Result"):
        """Show the combined threat map with threat sites and ``path`` overlaid."""
        pyplot = _pyplot()
        pyplot.figure(figsize=(12, 10))
        pyplot.imshow(self.threat_map.combined_threat.T, origin='lower', cmap='hot')
        _draw_threat_sites(pyplot, self.threat_map.radars, self.threat_map.fires)

        if path:
            path_x = [p[0] for p in path]
            path_y = [p[1] for p in path]
            pyplot.plot(path_x, path_y, 'g-', linewidth=2, label='Planned Path')
            pyplot.scatter(path_x[0], path_y[0], s=150, c='green', marker='o', label='Start')
            pyplot.scatter(path_x[-1], path_y[-1], s=150, c='purple', marker='*', label='Goal')

        pyplot.colorbar(label='Threat Level')
        pyplot.title(title)
        pyplot.xlabel('X (grid cells)')
        pyplot.ylabel('Y (grid cells)')
        pyplot.legend()
        pyplot.show()


def _run_ablation(heading, title, base_map, scenario, *, terrain=True, radars=True, fires=True):
    """Re-plan ``scenario`` on a copy of ``base_map`` with one threat source removed."""
    print(heading)
    ablated = ThreatMap(base_map.width, base_map.height, resolution=base_map.resolution)
    if terrain:
        ablated.terrain = base_map.terrain.copy()
    if radars:
        ablated.radars = base_map.radars.copy()
    if fires:
        ablated.fires = base_map.fires.copy()
    ablated.calculate_threats(z=500)
    ablated.combine_threats()

    planner = ImprovedARAStar(ablated)
    path, _, _, _ = planner.plan(scenario['start'], scenario['goal'], 120)
    planner.plot_path(path, title)


def run_simulation():
    """Build the demo threat map, plan three scenarios and run the ablation study."""
    width, height = 100, 100
    threat_map = ThreatMap(width, height, resolution=100)
    threat_map.generate_terrain('mountain')

    threat_map.add_radar(20, 80, 45, 5000)
    threat_map.add_radar(60, 40, 180, 4000)
    threat_map.add_radar(80, 70, 270, 6000)
    threat_map.add_fire(30, 30, 1000, 3000)
    threat_map.add_fire(50, 60, 1500, 4000)
    threat_map.add_fire(70, 20, 2000, 3500)

    threat_map.calculate_threats(z=500)
    threat_map.combine_threats()
    threat_map.plot_threat_map()

    planner = ImprovedARAStar(threat_map)

    scenarios = {
        'Easy': {'start': (10, 90), 'goal': (90, 10), 'max_time': 60},
        'Medium': {'start': (5, 5), 'goal': (95, 95), 'max_time': 120},
        'Hard': {'start': (10, 50), 'goal': (90, 50), 'max_time': 300},
    }

    results = []
    for name, params in scenarios.items():
        print(f"\nRunning {name} scenario...")
        path, length, threat, time_used = planner.plan(
            params['start'], params['goal'], params['max_time'])

        if path:
            # Four decimals so that sub-10 ms searches no longer print as 0.
            print(f"Path found! Length: {length:.2f}m, Avg Threat: {threat:.4f}, "
                  f"Time: {time_used:.4f}s")
            planner.plot_path(path, title=f"{name} Scenario Path Planning")
        else:
            print("No path found!")

        results.append({
            'scenario': name,
            'success': bool(path),
            'path_length': length,
            'avg_threat': threat,
            'time': time_used,
        })

    print("\nSimulation Results:")
    print("{:<10} {:<10} {:<15} {:<15} {:<10}".format(
        "Scenario", "Success", "Path Length(m)", "Avg Threat", "Time(s)"))
    for res in results:
        print("{:<10} {:<10} {:<15.2f} {:<15.4f} {:<10.4f}".format(
            res['scenario'], str(res['success']), res['path_length'],
            res['avg_threat'], res['time']))

    print("\nRunning ablation study...")
    medium = scenarios['Medium']
    _run_ablation("\nWithout fire threats:", "Path Planning Without Fire Threats",
                  threat_map, medium, fires=False)
    _run_ablation("\nWithout radar threats:", "Path Planning Without Radar Threats",
                  threat_map, medium, radars=False)
    _run_ablation("\nWithout terrain threats:", "Path Planning Without Terrain Threats",
                  threat_map, medium, terrain=False)


if __name__ == "__main__":
    run_simulation()
最新发布
06-12
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值