详解第20行代码:return sorted(filtered_versions, key=lambda v: list(map(int, v.split('.'))))

🔎 第20行详解

return sorted(filtered_versions, key=lambda v: list(map(int, v.split('.'))))

🚨 拆解与解释

该行代码的作用是将 filtered_versions 中的版本号按正确的版本号排序规则进行升序排列。


🔍 Step 1:sorted() 函数

  • sorted() 是 Python 的内置函数,用于对可迭代对象(如列表、元组、字典等)进行排序。
  • 语法:
sorted(iterable, key=None, reverse=False)
| 参数 | 作用 |
| --- | --- |
| iterable | 需要排序的对象(如 list、tuple、dict 等) |
| key | 指定排序的规则(函数) |
| reverse | 若为 True,则按降序排序(默认为升序) |

🔍 Step 2:key=lambda v: list(map(int, v.split('.')))

该部分是 sorted() 的 key 参数,指定了排序的依据。

  • lambda v: ...:匿名函数,v 表示 filtered_versions 中的每个版本号。
  • v.split('.')
    • 将版本号字符串按 . 分割成列表。
    • 示例:"1.2.10" → ["1", "2", "10"]
  • map(int, ...)
    • 将 split() 生成的字符串列表转换为整数列表,以确保数字的正确比较。
    • 示例:map(int, ["1", "2", "10"]) → [1, 2, 10](需用 list() 展开后才能看到该结果)
  • list(...):将 map 对象转换为一个具体的 list。map 对象本身不支持大小比较,而列表可以逐元素比较,因此必须先转换为 list 才能作为排序键。

示例:

print(list(map(int, "1.2.10".split('.'))))  
# 输出:[1, 2, 10]

🔍 Step 3:完整流程

以 filtered_versions = ["1.2.3", "1.2.10", "2.0.0", "1.0.5"] 为例:

原始数据

filtered_versions = ["1.2.3", "1.2.10", "2.0.0", "1.0.5"]

排序过程

sorted_versions = sorted(filtered_versions, key=lambda v: list(map(int, v.split('.'))))

🔹 key 参数的具体转换过程:

| 版本号 | 转换结果 |
| --- | --- |
| "1.2.3" | [1, 2, 3] |
| "1.2.10" | [1, 2, 10] |
| "2.0.0" | [2, 0, 0] |
| "1.0.5" | [1, 0, 5] |

排序结果

['1.0.5', '1.2.3', '1.2.10', '2.0.0']

❗️ 为什么不能直接使用 sorted(filtered_versions)

如果仅使用 sorted(filtered_versions)(不指定 key),它会将版本号作为字符串逐字符比较:"1.2.10" 与 "1.2.3" 的前四个字符相同,第五个字符 "1" 小于 "3",结果会错误地将 "1.2.10" 排在 "1.2.3" 之前。

['1.0.5', '1.2.10', '1.2.3', '2.0.0'](错误)
['1.0.5', '1.2.3', '1.2.10', '2.0.0'](正确)

map(int, v.split('.')) 保证每一部分都按数值正确排序,而非按字符串字典序排序。


总结

第20行代码的核心逻辑:

🔹 sorted():对 filtered_versions 进行排序
🔹 key=lambda v: list(map(int, v.split('.'))):将版本号转换为数值列表,确保排序正确
🔹 结果为按版本号正确排序的列表

import tkinter as tk from tkinter import ttk, filedialog import ttkbootstrap as ttkb from ttkbootstrap.constants import * import random import time import threading class DeviceSearchDialog(ttkb.Toplevel): """设备搜索浮层窗口""" def __init__(self, parent, callback): super().__init__(parent) self.title("设备连接") self.geometry("600x500") self.resizable(False, False) self.callback = callback # 选择设备后的回调函数 # 设置窗口位于父窗口中心 self.transient(parent) self.grab_set() # 创建主框架 main_frame = ttkb.Frame(self) main_frame.pack(fill=BOTH, expand=True, padx=10, pady=10) # 左侧设备系列面板 series_frame = ttkb.LabelFrame(main_frame, text="设备系列", padding=10) series_frame.pack(side=LEFT, fill=Y, padx=(0, 10)) # 搜索框 search_frame = ttkb.Frame(series_frame) search_frame.pack(fill=X, pady=(0, 10)) ttkb.Label(search_frame, text="搜索:").pack(side=LEFT, padx=(0, 5)) self.search_var = tk.StringVar() search_entry = ttkb.Entry( search_frame, textvariable=self.search_var, width=20 ) search_entry.pack(side=LEFT, fill=X, expand=True, padx=(0, 5)) search_entry.bind("<KeyRelease>", self.filter_devices) # 设备系列列表 series_list_frame = ttkb.Frame(series_frame) series_list_frame.pack(fill=BOTH, expand=True) self.series_list = ttkb.Treeview( series_list_frame, columns=("count"), show="tree", height=15 ) self.series_list.pack(fill=BOTH, expand=True) # 添加滚动条 scrollbar = ttkb.Scrollbar(series_list_frame, command=self.series_list.yview) scrollbar.pack(side=RIGHT, fill=Y) self.series_list.config(yscrollcommand=scrollbar.set) # 添加系列 series = ["Pro系列", "数字系列", "眼镜系列", "运动系列", "降噪系列"] for s in series: self.series_list.insert("", "end", text=s, values=(f"({random.randint(5, 20)}款)")) # 绑定选择事件 self.series_list.bind("<<TreeviewSelect>>", self.on_series_select) # 右侧设备列表面板 devices_frame = ttkb.LabelFrame(main_frame, text="设备列表", padding=10) devices_frame.pack(side=RIGHT, fill=BOTH, expand=True) # 设备列表 self.devices_list = ttkb.Treeview( devices_frame, columns=("status"), show="headings", height=15 ) self.devices_list.pack(fill=BOTH, expand=True) 
# 设置列 self.devices_list.heading("#0", text="设备名称") self.devices_list.heading("status", text="状态") self.devices_list.column("#0", width=250, anchor=W) self.devices_list.column("status", width=100, anchor=CENTER) # 添加滚动条 scrollbar_devices = ttkb.Scrollbar(devices_frame, command=self.devices_list.yview) scrollbar_devices.pack(side=RIGHT, fill=Y) self.devices_list.config(yscrollcommand=scrollbar_devices.set) # 绑定设备选择事件 self.devices_list.bind("<<TreeviewSelect>>", self.on_device_select) # 底部按钮 button_frame = ttkb.Frame(main_frame) button_frame.pack(fill=X, pady=10) self.connect_btn = ttkb.Button( button_frame, text="开始连接", bootstyle=SUCCESS, state=DISABLED, width=15, command=self.connect_device ) self.connect_btn.pack(side=RIGHT, padx=(10, 0)) ttkb.Button( button_frame, text="取消", bootstyle=SECONDARY, width=10, command=self.destroy ).pack(side=RIGHT) # 初始化设备数据 self.device_data = self.generate_devices() self.filtered_devices = [] # 默认显示所有设备 self.display_devices(self.device_data) def generate_devices(self): """生成示例设备数据""" series_map = { "Pro系列": ["Pro 1", "Pro 2 Lite", "Pro Max", "Pro Ultra"], "数字系列": ["100", "200", "300", "400", "500"], "眼镜系列": ["Smart Glass X", "AR Vision Pro", "VR Headset 2023"], "运动系列": ["RunFit", "SwimPro", "Cycling Buds"], "降噪系列": ["NoiseFree 1", "NoiseFree Pro", "SilenceMax"] } statuses = ["在线", "离线", "待机"] devices = [] for series, models in series_map.items(): for model in models: device = { "name": f"{series} {model}", "series": series, "status": random.choice(statuses) } devices.append(device) return devices def display_devices(self, devices): """在设备列表中显示设备""" self.devices_list.delete(*self.devices_list.get_children()) self.filtered_devices = devices for device in devices: status_color = SUCCESS if device["status"] == "在线" else ( WARNING if device["status"] == "待机" else DANGER ) self.devices_list.insert( "", END, text=device["name"], values=(device["status"],), tags=(status_color,) ) self.devices_list.tag_configure(SUCCESS, 
foreground=ttkb.Style().colors.get(SUCCESS)) self.devices_list.tag_configure(WARNING, foreground=ttkb.Style().colors.get(WARNING)) self.devices_list.tag_configure(DANGER, foreground=ttkb.Style().colors.get(DANGER)) def filter_devices(self, event=None): """根据搜索框内容过滤设备""" search_term = self.search_var.get().lower() if not search_term: # 如果没有搜索词,显示当前系列的所有设备 selected = self.series_list.selection() if selected: series = self.series_list.item(selected[0], "text") self.display_devices([d for d in self.device_data if d["series"] == series]) else: self.display_devices(self.device_data) return # 过滤设备 filtered = [d for d in self.device_data if search_term in d["name"].lower()] self.display_devices(filtered) def on_series_select(self, event): """选择系列时过滤设备""" selected = self.series_list.selection() if not selected: return series = self.series_list.item(selected[0], "text") self.display_devices([d for d in self.device_data if d["series"] == series]) def on_device_select(self, event): """选择设备时启用连接按钮""" selected = self.devices_list.selection() self.connect_btn.config(state=NORMAL if selected else DISABLED) def connect_device(self): """连接选中的设备""" selected = self.devices_list.selection() if not selected: return device_name = self.devices_list.item(selected[0], "text") device = next((d for d in self.filtered_devices if d["name"] == device_name), None) if device and self.callback: self.callback(device) self.destroy() class MainApplication(ttkb.Window): """主应用程序窗口""" def __init__(self): super().__init__(themename="vapor") self.title("音频设备刷机工具") self.geometry("1200x700") # 创建主框架 main_frame = ttkb.Frame(self) main_frame.pack(fill=BOTH, expand=True) # 创建左右分割面板 paned_window = ttkb.PanedWindow(main_frame, orient=HORIZONTAL) paned_window.pack(fill=BOTH, expand=True) # 左侧面板 - 设备列表 left_panel = ttkb.Frame(paned_window, padding=10) paned_window.add(left_panel, weight=1) # 右侧面板 - 设备详情 right_panel = ttkb.Frame(paned_window, padding=10) paned_window.add(right_panel, weight=3) # 构建左侧面板内容 
self.build_left_panel(left_panel) # 构建右侧面板内容 self.build_right_panel(right_panel) # 当前连接状态 self.connected = False self.device_info = {} # 设备版本选项 self.firmware_versions = ["v1.0.1", "v1.0.2", "v1.1.0", "v2.0.0", "v2.1.3"] # 初始化状态 self.update_connection_status("未连接") self.set_ui_state(False) def build_left_panel(self, parent): """构建左侧设备列表面板""" # 设备列表标签页 notebook = ttkb.Notebook(parent) notebook.pack(fill=BOTH, expand=True) # 设备列表标签页 device_frame = ttkb.Frame(notebook) notebook.add(device_frame, text="设备列表") # 搜索区域 search_frame = ttkb.Frame(device_frame) search_frame.pack(fill=X, pady=(0, 10)) ttkb.Label(search_frame, text="设备搜索:").pack(side=LEFT, padx=(0, 5)) self.search_entry = ttkb.Entry( search_frame, width=20 ) self.search_entry.pack(side=LEFT, fill=X, expand=True, padx=(0, 5)) search_btn = ttkb.Button( search_frame, text="搜索", bootstyle=PRIMARY, command=self.open_device_search ) search_btn.pack(side=LEFT) # 设备列表 list_frame = ttkb.Frame(device_frame) list_frame.pack(fill=BOTH, expand=True) # 创建带滚动条的树状视图 scrollbar = ttkb.Scrollbar(list_frame) scrollbar.pack(side=RIGHT, fill=Y) self.device_tree = ttkb.Treeview( list_frame, columns=("status", "version"), show="headings", yscrollcommand=scrollbar.set, height=15 ) self.device_tree.pack(fill=BOTH, expand=True) scrollbar.config(command=self.device_tree.yview) # 设置列 self.device_tree.heading("#0", text="设备名称") self.device_tree.heading("status", text="状态") self.device_tree.heading("version", text="固件版本") self.device_tree.column("#0", width=150, anchor=W) self.device_tree.column("status", width=80, anchor=CENTER) self.device_tree.column("version", width=100, anchor=CENTER) # 添加示例设备 self.add_sample_devices() # 绑定选择事件 self.device_tree.bind("<<TreeviewSelect>>", self.on_device_select) # 底部搜索按钮 btn_frame = ttkb.Frame(device_frame) btn_frame.pack(fill=X, pady=10) ttkb.Button( btn_frame, text="搜索设备", bootstyle=PRIMARY, width=15, command=self.open_device_search ).pack(anchor=CENTER) def build_right_panel(self, parent): 
"""构建右侧设备详情面板""" # 第一部分 - 标题和版本 title_frame = ttkb.Frame(parent) title_frame.pack(fill=X, pady=(0, 15)) ttkb.Label( title_frame, text="音频设备刷机工具", font=("微软雅黑", 16, "bold") ).pack(side=LEFT) ttkb.Label( title_frame, text="当前版本:v1.0", bootstyle=SECONDARY ).pack(side=RIGHT) # 第二部分 - 设备状态和产品信息 status_frame = ttkb.LabelFrame(parent, text="设备状态", padding=10) status_frame.pack(fill=X, pady=(0, 15)) # 连接状态 conn_frame = ttkb.Frame(status_frame) conn_frame.pack(fill=X, pady=(0, 5)) ttkb.Label(conn_frame, text="连接状态:").pack(side=LEFT, padx=(0, 5)) self.conn_status = ttkb.Label( conn_frame, text="未连接", bootstyle=DANGER ) self.conn_status.pack(side=LEFT) # 产品信息 info_frame = ttkb.Frame(status_frame) info_frame.pack(fill=X, pady=5) ttkb.Label(info_frame, text="产品型号:").pack(side=LEFT, padx=(0, 5)) self.model_label = ttkb.Label(info_frame, text="N/A", bootstyle=SECONDARY) self.model_label.pack(side=LEFT, padx=(0, 20)) ttkb.Label(info_frame, text="产品名称:").pack(side=LEFT, padx=(0, 5)) self.name_label = ttkb.Label(info_frame, text="N/A", bootstyle=SECONDARY) self.name_label.pack(side=LEFT) # 第三部分 - 版本信息表格 version_frame = ttkb.LabelFrame(parent, text="设备版本信息", padding=10) version_frame.pack(fill=X, pady=(0, 15)) # 创建版本表格 columns = ["SN", "内部版本", "外部版本", "MCU版本"] self.version_tree = ttkb.Treeview( version_frame, columns=columns, show="headings", height=4 ) self.version_tree.pack(fill=X) # 设置列 self.version_tree.heading("#0", text="组件") for col in columns: self.version_tree.heading(col, text=col) self.version_tree.column(col, width=90, anchor=CENTER) self.version_tree.column("#0", width=80, anchor=CENTER) # 添加示例数据 components = ["充电盒", "左耳", "右耳"] for comp in components: self.version_tree.insert("", "end", text=comp, values=("N/A", "N/A", "N/A", "N/A")) # 第四部分 - 刷机工具 flash_frame = ttkb.LabelFrame(parent, text="刷机工具", padding=10) flash_frame.pack(fill=X, pady=(0, 15)) # 设备版本选择 ver_frame = ttkb.Frame(flash_frame) ver_frame.pack(fill=X, pady=5) ttkb.Label(ver_frame, 
text="设备版本:").pack(side=LEFT, padx=(0, 10)) self.firmware_var = tk.StringVar() firmware_combo = ttkb.Combobox( ver_frame, textvariable=self.firmware_var, values=self.firmware_versions, state="readonly", width=15 ) firmware_combo.pack(side=LEFT) firmware_combo.current(0) # 充电盒版本文件 case_frame = ttkb.Frame(flash_frame) case_frame.pack(fill=X, pady=5) ttkb.Label(case_frame, text="充电盒版本文件:").pack(side=LEFT, padx=(0, 10)) self.case_file_var = tk.StringVar() case_entry = ttkb.Entry( case_frame, textvariable=self.case_file_var, width=30 ) case_entry.pack(side=LEFT, fill=X, expand=True, padx=(0, 5)) case_btn = ttkb.Button( case_frame, text="浏览...", bootstyle=SECONDARY, command=lambda: self.select_file(self.case_file_var) ) case_btn.pack(side=LEFT) # 耳机版本文件 ear_frame = ttkb.Frame(flash_frame) ear_frame.pack(fill=X, pady=5) ttkb.Label(ear_frame, text="耳机版本文件:").pack(side=LEFT, padx=(0, 10)) self.ear_file_var = tk.StringVar() ear_entry = ttkb.Entry( ear_frame, textvariable=self.ear_file_var, width=30) ear_entry.pack(side=LEFT, fill=X, expand=True, padx=(0, 5)) ear_btn = ttkb.Button( ear_frame, text="浏览...", bootstyle=SECONDARY, command=lambda: self.select_file(self.ear_file_var) ) ear_btn.pack(side=LEFT) # 日志级别选择 log_frame = ttkb.Frame(flash_frame) log_frame.pack(fill=X, pady=5) ttkb.Label(log_frame, text="日志级别:").pack(side=LEFT, padx=(0, 10)) self.log_level_var = tk.StringVar(value="INFO") log_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] for level in log_levels: ttkb.Radiobutton( log_frame, text=level, variable=self.log_level_var, value=level, bootstyle=PRIMARY ).pack(side=LEFT, padx=(0, 10)) # 刷机按钮 flash_btn = ttkb.Button( flash_frame, text="开始刷机", bootstyle=SUCCESS, command=self.start_flashing ) flash_btn.pack(pady=10, anchor=CENTER) # 第五部分 - 刷机进度 progress_frame = ttkb.LabelFrame(parent, text="刷机进度", padding=10) progress_frame.pack(fill=X, pady=(0, 15)) # 进度条 self.progress_bar = ttkb.Progressbar( progress_frame, orient=HORIZONTAL, mode="determinate", 
bootstyle=SUCCESS ) self.progress_bar.pack(fill=X, pady=5) # 进度标签 self.progress_label = ttkb.Label( progress_frame, text="等待开始...", bootstyle=SECONDARY ) self.progress_label.pack(fill=X, pady=(0, 5)) # 第六部分 - 日志输出 log_output_frame = ttkb.LabelFrame(parent, text="日志输出", padding=10) log_output_frame.pack(fill=BOTH, expand=True) # 日志文本框 self.log_text = tk.Text( log_output_frame, wrap=WORD, height=8, state=DISABLED ) self.log_text.pack(fill=BOTH, expand=True) # 滚动条 scrollbar_log = ttkb.Scrollbar( log_output_frame, command=self.log_text.yview ) scrollbar_log.pack(side=RIGHT, fill=Y) self.log_text.config(yscrollcommand=scrollbar_log.set) def select_file(self, file_var): """打开文件对话框选择文件""" file_path = filedialog.askopenfilename( filetypes=[("固件文件", "*.bin *.hex"), ("所有文件", "*.*")] ) if file_path: file_var.set(file_path) def start_flashing(self): """开始刷机过程""" if not self.connected: self.log_message("错误:请先连接设备!", "ERROR") return # 检查文件路径 case_file = self.case_file_var.get() ear_file = self.ear_file_var.get() if not case_file and not ear_file: self.log_message("错误:请至少选择一个固件文件!", "ERROR") return # 更新UI状态 self.set_ui_state(False) self.progress_bar["value"] = 0 self.progress_label.config(text="准备中...") # 模拟刷机过程(实际应用中应使用线程) threading.Thread(target=self.simulate_flashing, args=(case_file, ear_file)).start() def simulate_flashing(self, case_file, ear_file): """模拟刷机过程(实际应用中应替换为真实逻辑)""" steps = [ "初始化设备...", "验证固件文件...", "更新充电盒固件..." if case_file else "跳过充电盒更新", "更新耳机固件..." 
if ear_file else "跳过耳机更新", "验证固件版本...", "刷机完成" ] total_steps = len(steps) step_duration = 1.5 # 每个步骤的持续时间(秒) for i, step in enumerate(steps): progress = (i + 1) / total_steps * 100 self.progress_bar["value"] = progress self.progress_label.config(text=step) self.log_message(f"[INFO] {step}") time.sleep(step_duration) # 完成后的处理 self.set_ui_state(True) self.log_message("[SUCCESS] 刷机已完成!设备已重新启动") self.progress_label.config(text="刷机完成!", bootstyle=SUCCESS) def log_message(self, message, level=None): """向日志框添加消息""" if not level: level = self.log_level_var.get() # 根据日志级别设置文本颜色 color_map = { "DEBUG": "#888888", "INFO": "#000000", "WARNING": "#FF9900", "ERROR": "#FF0000", "CRITICAL": "#FF0000" } tag = level.lower() self.log_text.config(state=NORMAL) # 添加时间戳 timestamp = time.strftime("%H:%M:%S") full_message = f"[{timestamp}] {message}\n" # 插入带格式的文本 self.log_text.insert(END, full_message, tag) self.log_text.tag_config(tag, foreground=color_map.get(level, "#000000")) # 滚动到底部 self.log_text.see(END) self.log_text.config(state=DISABLED) def add_sample_devices(self): """添加示例设备到设备列表""" devices = [ {"name": "Pro Max", "status": "在线", "version": "v1.2.3"}, {"name": "Digital 300", "status": "离线", "version": "v1.0.5"}, {"name": "Smart Glass X", "status": "待机", "version": "v2.1.0"}, {"name": "RunFit Pro", "status": "在线", "version": "v1.5.2"}, {"name": "NoiseFree 1", "status": "离线", "version": "v1.3.7"} ] status_colors = { "在线": SUCCESS, "离线": DANGER, "待机": WARNING } for device in devices: self.device_tree.insert( "", END, text=device["name"], values=(device["status"], device["version"]), tags=(status_colors[device["status"]],) ) # 配置标签颜色 for status, style in status_colors.items(): self.device_tree.tag_configure(style, foreground=ttkb.Style().colors.get(style)) def on_device_select(self, event): """设备列表选择事件处理""" selected = self.device_tree.selection() if not selected: return item = self.device_tree.item(selected[0]) self.update_device_info(item["text"]) def update_device_info(self, 
device_name): """更新设备信息显示""" # 模拟设备信息(实际应用中应替换为真实数据) self.device_info = { "model": device_name.split()[0], "name": device_name, "status": "在线", "versions": { "充电盒": ["SN001", "v1.2.3", "v1.2", "v1.0"], "左耳": ["SN002", "v1.2.3", "v1.2", "v1.0"], "右耳": ["SN003", "v1.2.3", "v1.2", "v1.0"] } } # 更新UI self.model_label.config(text=self.device_info["model"]) self.name_label.config(text=self.device_info["name"]) self.update_connection_status("已连接") # 更新版本表格 for child in self.version_tree.get_children(): self.version_tree.delete(child) for component, versions in self.device_info["versions"].items(): self.version_tree.insert("", END, text=component, values=tuple(versions)) # 启用UI self.set_ui_state(True) def open_device_search(self): """打开设备搜索对话框""" DeviceSearchDialog(self, self.on_device_selected) def on_device_selected(self, device): """设备搜索对话框回调函数""" self.connected = True self.update_device_info(device["name"]) self.log_message(f"已连接设备: {device['name']}") def update_connection_status(self, status): """更新连接状态显示""" self.connected = (status == "已连接") self.conn_status.config( text=status, bootstyle=SUCCESS if self.connected else DANGER ) def set_ui_state(self, enabled): """设置UI组件的启用/禁用状态""" state = NORMAL if enabled else DISABLED self.firmware_var.set("" if not enabled else self.firmware_versions[0]) # 更新组件状态 for widget in [ self.firmware_var, self.case_file_var, self.ear_file_var, self.log_level_var, self.search_entry ]: if hasattr(widget, "_widget"): widget._widget.config(state=state) else: # 对于Combobox需要特殊处理 if isinstance(widget, tk.StringVar): for combo in self.winfo_children(): if isinstance(combo, ttkb.Combobox) and combo.cget("textvariable") == widget: combo.config(state=state) # 更新按钮状态 for btn in self.winfo_children(): if isinstance(btn, ttkb.Button) and btn.cget("text") in ["开始刷机", "搜索设备"]: btn.config(state=state) if __name__ == "__main__": app = MainApplication() app.mainloop() 代码报AttributeError: '_tkinter.tkapp' object has no attribute 
'firmware_versions',修改后的代码全部输出出来
12-18
import sys import math from pyspark import SparkConf, SparkContext from pyspark.sql.session import SparkSession # ------------------------------------------------------------ # Scalable Spatio-Textual Similarity Join (RDD solution) # ------------------------------------------------------------ # Usage: # spark-submit project3.py <input_A_path> <input_B_path> <output_path> <d> <s> # # Design notes: # - Grid index with cell size = d. Each A record is replicated to its 8-neighborhood # (including its own cell); B records are assigned to their own cell only. Candidate # pairs are generated only within each cell. Exact Euclidean distance is checked. # - Textual filtering uses exact Jaccard with prefix filtering: # * Tokens are globally ordered by (frequency asc, token asc). # * For a record S with |S| terms and threshold t, prefix length is: # p(S) = |S| - ceil(t*|S|) + 1 (SIGMOD'10) # * We compute the prefixes of A and B and discard pairs whose prefixes do not intersect. # - Results are deduplicated and sorted by numeric IDs of (A,B). 
# - Output format: # (A<id>,B<id>):<distance_6dec>,<jaccard_6dec> # ------------------------------------------------------------ def parse_record(line): # Example line: A0#(1,1)#apple banana orange rid, coord, terms = line.strip().split("#") typ = rid[0] # 'A' or 'B' num = int(rid[1:]) # numeric id for sorting coord = coord.strip()[1:-1] # remove parentheses x_str, y_str = coord.split(",") x = float(x_str) y = float(y_str) # terms separated by single spaces; remove duplicates right here toks = list(dict.fromkeys(terms.strip().split())) # preserve original order while deduping return (typ, num, rid, x, y, toks) def prefix_len(n, t): # p = n - ceil(t*n) + 1 ; never negative p = n - int(math.ceil(t * n)) + 1 return max(p, 0) def euclid(ax, ay, bx, by): dx = ax - bx dy = ay - by return math.sqrt(dx*dx + dy*dy) class Project3: def run(self, inputA, inputB, output, d_str, s_str): d = float(d_str) s = float(s_str) conf = SparkConf().setAppName("Project3-SpatioTextualJoin") sc = SparkContext.getOrCreate(conf=conf) # --------------------- Load & Parse --------------------- rddA_raw = sc.textFile(inputA).map(parse_record) rddB_raw = sc.textFile(inputB).map(parse_record) # --------------------- Global token order --------------------- # Build token frequency over union of A and B tokensA = rddA_raw.flatMap(lambda r: [(t,1) for t in r[5]]) tokensB = rddB_raw.flatMap(lambda r: [(t,1) for t in r[5]]) token_freq = tokensA.union(tokensB).reduceByKey(lambda a,b: a+b) # Assign global order: (freq asc, token lex asc) -> rank order = ( token_freq .sortBy(lambda kv: (kv[1], kv[0])) # (token, freq) .zipWithIndex() # ((token, freq), idx) .map(lambda x: (x[0][0], int(x[1]))) # token -> rank .collectAsMap() ) order_bc = sc.broadcast(order) # Replace tokens with globally-ordered list and set versions def decorate_tokens(r): typ, num, rid, x, y, toks = r # sort tokens by global order rank ordmap = order_bc.value toks_sorted = sorted(toks, key=lambda t: (ordmap.get(t, 10**12), t)) toks_set = 
set(toks_sorted) prelen = prefix_len(len(toks_sorted), s) prefix = toks_sorted[:prelen] return (typ, num, rid, x, y, toks_sorted, toks_set, prefix) rddA = rddA_raw.map(decorate_tokens).persist() rddB = rddB_raw.map(decorate_tokens).persist() # --------------------- Grid index --------------------- cellSize = d def cell_of(x, y): return (int(math.floor(x / cellSize)), int(math.floor(y / cellSize))) def emit_A_cells(r): # replicate A to 3x3 neighborhood typ, num, rid, x, y, toks_sorted, toks_set, prefix = r gx, gy = cell_of(x, y) out = [] for dx in (-1, 0, 1): for dy in (-1, 0, 1): out.append(((gx+dx, gy+dy), ('A', (num, rid, x, y, toks_sorted, toks_set, prefix)))) return out def emit_B_cell(r): typ, num, rid, x, y, toks_sorted, toks_set, prefix = r gx, gy = cell_of(x, y) return ((gx, gy), ('B', (num, rid, x, y, toks_sorted, toks_set, prefix))) keyed = rddA.flatMap(emit_A_cells).union(rddB.map(emit_B_cell)) # --------------------- Candidate generation within each cell --------------------- def generate_pairs(iterable): Arec = [] Brec = [] for tag, rec in iterable: if tag == 'A': Arec.append(rec) else: Brec.append(rec) results = [] if not Arec or not Brec: return results # cross product with prefix filter for an, arid, ax, ay, atoks, aset, aprefix in Arec: aplen = len(aprefix) for bn, brid, bx, by, btoks, bset, bprefix in Brec: # prefix filter: intersect of the prefixes must be non-empty # Compute smaller prefix first if aplen == 0 and len(bprefix) == 0: pass # still possible (empty sets) -> will be filtered by jaccard below else: if aplen <= len(bprefix): small = aprefix big = set(bprefix) else: small = bprefix big = set(aprefix) inter_prefix = any(t in big for t in small) if not inter_prefix: continue # spatial check dist = euclid(ax, ay, bx, by) if dist > d: continue # exact jaccard inter = len(aset.intersection(bset)) if inter == 0: continue union = len(aset) + len(bset) - inter jacc = float(inter) / float(union) if union > 0 else 0.0 if jacc + 1e-12 < s: # small 
epsilon continue # keep results.append(((an, bn), (arid, brid, dist, jacc))) return results candidates = ( keyed.groupByKey() .flatMap(lambda kv: generate_pairs(kv[1])) ) # --------------------- Deduplicate and sort --------------------- # A record may appear in multiple cells; deduplicate by (AidNum, BidNum) unique_pairs = candidates.reduceByKey(lambda a,b: a) # any value; pairs are identical by definition # Sort by (AidNum asc, BidNum asc) sorted_pairs = unique_pairs.sortByKey(ascending=True) # --------------------- Formatting --------------------- def fmt(pair): (aidn, bidn), (aid, bid, dist, jacc) = pair return f"({aid},{bid}):{dist:.6f},{jacc:.6f}" output_rdd = sorted_pairs.map(fmt) # Save output_rdd.coalesce(1).saveAsTextFile(output) sc.stop() if __name__ == '__main__': if len(sys.argv) != 6: print("Wrong arguments") sys.exit(-1) Project3().run(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])这段代码什么意思
08-10
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值