#!/usr/bin/env python3
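"""Plot data from CSV/TSV files or stdin with flexible column and row selection.

Columns are chosen with 1-based indices or ranges (e.g. "1,3" or "5-7"), rows
with the -S/-s/-E/-e options, and charts are rendered with seaborn/matplotlib.
"""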
import argparse
import sys
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import os
import subprocess
import shlex
import io
import tempfile
from typing import Union, TextIO, Optional
def parse_columns(cols: list[str]) -> list[int]:
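    """Parse 1-based column specs such as '1,3' or '2-5' (ranges inclusive) into a flat list of ints."""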
cols_str = ','.join(cols)
cols_str = cols_str.replace(' ', '')
parts = cols_str.split(',')
indices = []
for part in parts:
if not part:
continue
if '-' in part:
range_parts = part.split('-')
if len(range_parts) != 2:
raise ValueError(f"Invalid range: '{part}' (example: 1-3)")
start_str, end_str = range_parts
try:
start = int(start_str)
end = int(end_str)
except ValueError:
raise ValueError(f"Invalid range: '{part}'")
if start > end:
raise ValueError(f"Invalid range: '{part}' (must be start < end, e.g., 3-5, not 5-3)")
if start < 0:
raise ValueError(f"Invalid range: '{part}' (negative start)")
indices.extend(range(start, end + 1))
else:
try:
idx = int(part)
except ValueError:
raise ValueError(f"Invalid index: '{part}' (must be an integer)")
if idx < 0:
raise ValueError(f"Invalid index: '{part}' (negative)")
indices.append(idx)
return indices
def maximize_plot_window():
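    """Best-effort attempt to maximize the current matplotlib figure window."""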
    backend = plt.get_backend().lower()
    try:
        manager = plt.get_current_fig_manager()
        if 'qt5agg' in backend or 'qt4agg' in backend:
            manager.window.showMaximized()
        else:
            manager.full_screen_toggle()
    except Exception as e:
        print(f"[DEBUG] Could not maximize window ({backend}): {e}", file=sys.stderr)
def generate_awk_command(col_indices: list[int], preprocess: Optional[str] = None) -> list[str]:
"""Generate awk command to select specific columns"""
if not col_indices:
return ['cat']
    col_indices = [i + 1 for i in col_indices]  # awk field numbers are 1-based
    # Join the selected fields with tab characters
fields = [f'${i}' for i in col_indices]
fields_str = '"\\t"'.join(fields)
awk_cmd = []
if preprocess:
awk_cmd.append(f'{{{preprocess}; print {fields_str}}}')
else:
awk_cmd.append(f'{{print {fields_str}}}')
return ['awk'] + awk_cmd
def apply_column_filter(file_path: str, col_indices: list[int], debug: bool = False, preprocess: Optional[str] = None) -> str:
"""Apply column filter using external command (awk)"""
cmd = generate_awk_command(col_indices, preprocess)
cmd_str = ' '.join(shlex.quote(str(x)) for x in cmd)
if debug:
print(f"[DEBUG] Running column filter: {cmd_str}", file=sys.stderr)
try:
result = subprocess.run(
cmd + [file_path],
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
return result.stdout
except subprocess.CalledProcessError as e:
print(f"Error running column filter: {e.stderr}", file=sys.stderr)
raise
def smart_read_csv_with_filters(
file_input: Union[str, TextIO],
args,
col_indices: Optional[list[int]] = None,
sep: str = '[ \t,;]+',
debug: bool = False
) -> pd.DataFrame:
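    """Read CSV/TSV data from a file path or a text stream into a DataFrame.

    Column selection and row filtering are delegated to external tools
    (awk/head/tail) when possible, with a pandas fallback otherwise.
    """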
drop_start = args.drop_start
keep_start = args.keep_start
drop_end = args.drop_end
keep_end = args.keep_end
preprocess = args.awk
    # Detect piped input (e.g. sys.stdin) as opposed to a file path
    is_pipe = isinstance(file_input, io.TextIOWrapper)
    # For both piped and file input, first try to filter columns with an external command
    filtered_data = None
    pipe_content = None
    if col_indices and (is_pipe or isinstance(file_input, str)):
        try:
            if is_pipe:
                # Piped input: read everything into memory first
                pipe_content = file_input.read()
                # Store the piped content in a temporary file so awk can read it
                with tempfile.NamedTemporaryFile(mode='w+', delete=True) as tmp:
                    tmp.write(pipe_content)
                    tmp.flush()
                    filtered_data = apply_column_filter(tmp.name, col_indices, debug, preprocess)
            else:
                # File input: pass the path directly
                filtered_data = apply_column_filter(file_input, col_indices, debug, preprocess)
            # Wrap the filtered data in a StringIO object so pandas can read it
            file_input = io.StringIO(filtered_data)
        except Exception as e:
            if debug:
                print(f"[DEBUG] Column filter failed, falling back to pandas: {e}", file=sys.stderr)
            if is_pipe and pipe_content is not None:
                # stdin cannot be rewound, so reuse the content that was already read
                file_input = io.StringIO(pipe_content)
# Step 2: Apply row filters
    # Only run head/tail directly on the file when no column selection is still pending
    if isinstance(file_input, str) and filtered_data is None and not col_indices:
file_path = file_input
cmd = ['cat', file_path]
applied = False
if keep_end is not None:
cmd = ['tail', '-n', str(keep_end), file_path]
applied = True
        elif drop_end is not None:
            # 'head -n -N' is GNU-specific, so compute the number of lines to keep with 'wc -l';
            # if that fails, fall back to the GNU syntax and let the outer error handler cope.
            try:
                wc_process = subprocess.run(['wc', '-l', file_path], capture_output=True, text=True, check=True)
                total_lines = int(wc_process.stdout.strip().split()[0])
                keep_lines = max(total_lines - drop_end, 0)
                cmd = ['head', '-n', str(keep_lines), file_path]
            except Exception as fallback_e:
                if debug:
                    print(f"[DEBUG] wc -l failed ({fallback_e}), using GNU 'head -n -N'", file=sys.stderr)
                cmd = ['head', '-n', f'-{drop_end}', file_path]
            applied = True
elif keep_start is not None:
cmd = ['head', '-n', str(keep_start), file_path]
applied = True
elif drop_start is not None:
start_line = drop_start + 1
if start_line < 1:
start_line = 1
cmd = ['tail', '-n', f'+{start_line}', file_path]
applied = True
if applied:
if debug:
print(f"[DEBUG] Running: {' '.join(shlex.quote(str(x)) for x in cmd)}", file=sys.stderr)
try:
res = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
csv_data = res.stdout
df = pd.read_csv(
io.StringIO(csv_data),
header=0 if args.header_row else None,
engine='python',
sep=sep,
skip_blank_lines=True,
skipinitialspace=True
)
return df
except Exception as sub_e:
if debug:
print(f"[DEBUG] External command failed: {sub_e}, falling back to pandas", file=sys.stderr)
# Fallback to pandas
try:
df = pd.read_csv(
file_input,
header=0 if args.header_row else None,
engine='python',
sep=sep,
skip_blank_lines=True,
skipinitialspace=True,
on_bad_lines='warn',
quoting=csv.QUOTE_NONE,
skiprows=drop_start if drop_start else 0,
skipfooter=drop_end if drop_end else 0
)
except Exception as e:
print(f"Error reading input data: {e}", file=sys.stderr)
sys.exit(1)
# Apply column filter if not already done externally
    if col_indices and filtered_data is None:
max_idx = df.shape[1] - 1
for idx in col_indices:
if idx < 0 or idx > max_idx:
print(f"Error: Column index {idx + 1} is out of range. Valid: 1-{max_idx + 1}.", file=sys.stderr)
sys.exit(1)
seen = set()
unique_cols = []
for idx in col_indices:
if idx not in seen:
seen.add(idx)
unique_cols.append(idx)
df = df.iloc[:, unique_cols]
if df.empty:
print("Error: No data left after column filtering.", file=sys.stderr)
sys.exit(1)
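    # keep_start/keep_end still have to be applied here; drop_start/drop_end
    # were already handled via skiprows/skipfooter in the pandas read above.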
original_rows = len(df)
if keep_start is not None:
if keep_start < 0 or keep_start >= original_rows:
print(f"Error: --keep-start {keep_start} out of range [0, {original_rows})", file=sys.stderr)
sys.exit(1)
df = df.iloc[:keep_start]
if keep_end is not None:
if keep_end < 0 or keep_end >= original_rows:
print(f"Error: --keep-end {keep_end} out of range [0, {original_rows})", file=sys.stderr)
sys.exit(1)
df = df.iloc[-keep_end:] if keep_end > 0 else df
if len(df) == 0:
print("Error: No data rows remaining after filtering.", file=sys.stderr)
sys.exit(1)
return df
def main():
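    """Parse command-line arguments, load and filter the data, then print or plot it."""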
SUPPORTED_CHART_TYPES = {
'scatter', 'line', 'bar', 'hist', 'box', 'violin', 'heatmap'
}
parser = argparse.ArgumentParser(
description='Plot data from CSV/TSV or stdin with flexible column/axis configuration',
formatter_class=argparse.RawTextHelpFormatter,
epilog='Examples:\n'
' seq 1 10 | v -t line\n'
' echo -e "X,Y1,Y2\\nX1,2,3\\nX2,5,6" | v -H -t bar -T "My Bar Chart"\n'
' v data.csv 1,3 5-7 -t scatter -X "Time" -Y "Value"\n'
'Pro Tip: For very large files, use -S/-s/-E/-e for efficient row filtering.\n'
' These use native head/tail commands when possible.'
)
parser.add_argument('-t', '--chart-type', default='scatter',
help=f'Chart type: {", ".join(SUPPORTED_CHART_TYPES)}. Default: scatter')
parser.add_argument('-i', '--index-as-x', default=False, action='store_true',
                        help='Use the row index as the X axis')
parser.add_argument('-H', '--header-row', default=False, action='store_true',
help='Treat the first row as header')
parser.add_argument('-p', '--print', default=False, action='store_true',
help='Print filtered data (TSV) instead of plotting')
parser.add_argument('-d', '--debug', default=False, action='store_true',
help='Enable debug output')
parser.add_argument('-X', '--x-label', type=str, default=None,
help='Label for X-axis')
parser.add_argument('-Y', '--y-label', type=str, default=None,
help='Label for Y-axis')
parser.add_argument('-T', '--title', type=str, default=None,
help='Title of the plot')
parser.add_argument('-L', '--legend-labels', nargs='*', type=str, default=None,
help='Custom legend labels (must match number of legend entries)')
parser.add_argument('-S', '--drop-start', type=int, default=None,
help='Drop first N rows (remove starting rows)')
parser.add_argument('-s', '--keep-start', type=int, default=None,
help='Keep only first N rows (discard others)')
parser.add_argument('-E', '--drop-end', type=int, default=None,
help='Drop last N rows (remove ending rows)')
parser.add_argument('-e', '--keep-end', type=int, default=None,
help='Keep only last N rows (discard others)')
    parser.add_argument('--index-col', type=int, default=0,
                        help='0-based index of the column to use as the X axis (default: 0)')
    parser.add_argument('-a', '--awk', type=str, default=None,
                        help='AWK preprocessing command to apply to each line before column selection\n'
                             'Example: -a "$1/=1000" to divide the first column values by 1000')
args, remaining_argv = parser.parse_known_args()
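    # Leftover positionals hold the input file (when not piped) followed by the column specs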
has_pipe = not sys.stdin.isatty()
if not has_pipe and len(remaining_argv) == 0:
parser.print_help()
sys.exit(1)
file_arg = sys.stdin if has_pipe else remaining_argv[0]
cols_args = remaining_argv if has_pipe else remaining_argv[1:]
if args.debug:
print(f"[DEBUG] Remaining args: {remaining_argv}", file=sys.stderr)
if args.chart_type not in SUPPORTED_CHART_TYPES:
print(f"Error: Unsupported chart type '{args.chart_type}'. Supported: {', '.join(sorted(SUPPORTED_CHART_TYPES))}", file=sys.stderr)
sys.exit(1)
# Parse column indices first
col_indices = None
if cols_args:
try:
col_indices = parse_columns(cols_args)
col_indices = [x - 1 for x in col_indices] # Convert to 0-based
except ValueError as ve:
print(f"Error in column indices: {ve}", file=sys.stderr)
sys.exit(1)
df = smart_read_csv_with_filters(file_arg, args, col_indices=col_indices, sep='[ \t,;]+', debug=args.debug)
if not args.header_row:
df.columns = [f"Column {i + 1}" for i in range(len(df.columns))]
if args.print:
for line in df.to_csv(sep='\t', index=False, header=False).splitlines():
            # Keep only non-blank lines (lines containing non-whitespace characters)
if line.strip():
print(line)
sys.exit(0)
if args.index_as_x or len(df.columns) == 1:
df.insert(0, 'index', df.index)
args.index_col = 0
num_columns = df.shape[1]
if args.index_col < 0 or args.index_col >= num_columns:
print(f"Error: --index-col {args.index_col} is out of range. Valid: 0-{num_columns - 1}", file=sys.stderr)
sys.exit(1)
x_axis_col = df.columns[args.index_col]
value_vars = [col for idx, col in enumerate(df.columns) if idx != args.index_col]
id_vars = [x_axis_col]
if not value_vars:
print("Error: No data columns to plot.", file=sys.stderr)
sys.exit(1)
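    # Reshape to long format so seaborn can draw one hue series per data column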
try:
df_long = df.melt(id_vars=id_vars, value_vars=value_vars, var_name='variable', value_name='value')
except Exception as e:
print(f"Error converting to long format: {e}", file=sys.stderr)
sys.exit(1)
sns.set_theme(font_scale=2.0)
try:
if args.chart_type == 'scatter':
sns.scatterplot(data=df_long, x=x_axis_col, y='value', hue='variable')
elif args.chart_type == 'line':
sns.lineplot(data=df_long, x=x_axis_col, y='value', hue='variable')
elif args.chart_type == 'bar':
sns.barplot(data=df_long, x=x_axis_col, y='value', hue='variable', errorbar=None)
elif args.chart_type == 'hist':
sns.histplot(data=df_long, x='value', hue='variable', kde=True, multiple='stack')
elif args.chart_type == 'box':
sns.boxplot(data=df_long, x='variable', y='value')
elif args.chart_type == 'violin':
sns.violinplot(data=df_long, x='variable', y='value', inner="quartile")
elif args.chart_type == 'heatmap':
numeric_df = df.select_dtypes(include='number')
if numeric_df.empty:
print("Error: Heatmap needs numeric columns.", file=sys.stderr)
sys.exit(1)
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df, annot=True, cmap="coolwarm", fmt=".2f")
plt.xlabel(args.x_label or "")
plt.ylabel(args.y_label or "")
plt.title(args.title or "Heatmap")
if args.legend_labels:
print("Warning: --legend-labels ignored for heatmap.", file=sys.stderr)
maximize_plot_window()
plt.show()
return
except Exception as e:
print(f"Error during plotting: {e}", file=sys.stderr)
sys.exit(1)
legend_labels = None
if args.legend_labels:
legend_labels = args.legend_labels
elif args.y_label:
legend_labels = [args.y_label]
    ax = plt.gca()
    legend = ax.get_legend()
    if legend and legend_labels:
        if len(legend_labels) != len(legend.texts):
            print(f"Warning: got {len(legend_labels)} legend labels for {len(legend.texts)} legend entries.", file=sys.stderr)
        for t, l in zip(legend.texts, legend_labels):
            t.set_text(l)
if args.x_label:
plt.xlabel(args.x_label)
if args.y_label:
plt.ylabel(args.y_label)
if args.title:
plt.title(args.title)
else:
plt.title(f"{args.chart_type.capitalize()} Plot")
maximize_plot_window()
plt.show()
if __name__ == "__main__":
main()