#!/usr/bin/env python3
# diff_keep1_not2.py
# 说明:保留 1.txt 中存在但 2.txt 中不存在的行(比较时忽略行首尾空白)。
# 用法:python diff_keep1_not2.py 1.txt 2.txt -o out.txt [--overwrite]
import argparse
import sys
from pathlib import Path
def _read_lines(path):
    """Return the lines of *path* with their original line endings intact.

    The file is decoded as UTF-8 first ("utf-8-sig", so a leading BOM is
    dropped instead of sticking to the first line and breaking comparison).
    If that fails, it is re-read as GBK with undecodable bytes ignored, as a
    best-effort fallback for Windows-produced text files.

    ``newline=""`` disables newline translation, so ``\\r\\n`` endings are
    returned untouched and can be written back verbatim.
    """
    try:
        with path.open("r", encoding="utf-8-sig", newline="") as fh:
            return fh.readlines()
    except UnicodeDecodeError:
        # Best-effort fallback: bytes that are invalid in GBK are dropped.
        with path.open("r", encoding="gbk", errors="ignore", newline="") as fh:
            return fh.readlines()


def main():
    """Write the lines of file1 that do not occur in file2.

    Lines are compared after stripping leading/trailing whitespace. Blank
    lines of file1 are never written. Kept lines are written exactly as read
    (original whitespace and line ending). Output is always UTF-8.

    Command line: ``file1 file2 [-o OUT] [--overwrite]``.
    With ``--overwrite`` the result replaces file1 (via a temporary file in
    the same directory); otherwise it is written to OUT (default
    ``output.txt``).

    Exits with status 2 if an input file is missing, or if OUT would clobber
    an input file without ``--overwrite`` targeting file1.
    """
    parser = argparse.ArgumentParser(description="保留1.txt中存在但2.txt不存在的行(按行比较,忽略首尾空白)")
    parser.add_argument("file1", help="路径到 1.txt(要保留来源)")
    parser.add_argument("file2", help="路径到 2.txt(要排除的内容)")
    parser.add_argument("-o", "--out", help="输出文件(默认 output.txt)", default="output.txt")
    parser.add_argument("--overwrite", help="直接覆盖 file1(小心使用)", action="store_true")
    args = parser.parse_args()

    p1 = Path(args.file1)
    p2 = Path(args.file2)
    outp = Path(args.out)

    for src in (p1, p2):
        if not src.exists():
            print(f"错误:找不到 {src}", file=sys.stderr)
            sys.exit(2)

    # Writing onto file1 is only acceptable when the user explicitly asked
    # for it; any other collision between the output and an input is refused.
    explicit_overwrite = args.overwrite and outp.resolve() == p1.resolve()
    if (
        not explicit_overwrite
        and outp.exists()
        and outp.resolve() in (p1.resolve(), p2.resolve())
    ):
        print("错误:输出文件会覆盖输入文件。要覆盖请使用 --overwrite 或换一个输出文件名。", file=sys.stderr)
        sys.exit(2)

    # Stripped, non-blank lines of file2: the exclusion set.
    excluded = {key for key in (line.strip() for line in _read_lines(p2)) if key}

    lines1 = _read_lines(p1)

    # In overwrite mode, write to a temporary file next to file1 and swap it
    # in afterwards. Keeping it in file1's directory guarantees the final
    # replace() stays on one filesystem and that the directory exists.
    if args.overwrite:
        target = p1.with_name(p1.name + ".tmp")
    else:
        target = outp

    kept_count = 0
    total_count = 0
    try:
        with target.open("w", encoding="utf-8", newline="") as fout:
            for line in lines1:
                total_count += 1
                key = line.strip()
                if not key:
                    # Blank lines are dropped from the output by design.
                    continue
                if key not in excluded:
                    fout.write(line)
                    kept_count += 1
        if args.overwrite:
            target.replace(p1)
    except BaseException:
        # Do not leave a half-written temporary file behind on failure.
        if args.overwrite:
            try:
                target.unlink()
            except FileNotFoundError:
                pass
        raise

    if args.overwrite:
        print(f"已覆盖 {p1}:保留了 {kept_count} / {total_count} 行(不含空行)。")
    else:
        print(f"输出写入 {target}。保留了 {kept_count} / {total_count} 行(不含空行)。")
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|