对比两个 CSV 文件的记录差异(按指定列找出只存在于其中一个文件的记录)

清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>

# -*- coding:utf-8 -*-  
import csv
import os
class csvread():
    """Load a CSV file and compare it with another one on a key column.

    The constructor reads the file eagerly and exposes:

    * ``header``  -- list of column names (the first CSV row, ``[]`` for an
      empty file);
    * ``data``    -- list of row mappings as produced by ``csv.DictReader``;
    * ``dataset`` -- set of the values found in the key column.

    ``a - b`` writes the rows of ``a`` whose key value does not occur in
    ``b`` to a new CSV file and returns that file's path.
    """

    def __init__(self, filename, lstname=None, encoding=None):
        """Read *filename* and index it by the column *lstname*.

        Args:
            filename: Path of the CSV file to load.
            lstname: Name of the column used to match records between two
                files.  When omitted, the first header column is used.
            encoding: Text encoding passed to ``open``; ``None`` keeps the
                platform default.
        """
        self.filename = filename
        self.encoding = encoding
        self.header = self.fields()
        # Fall back to the first column so a key column is optional.
        if lstname is None and self.header:
            lstname = self.header[0]
        self.lstname = lstname
        self.data = self.getdata()
        self.dataset = self.getset()

    def getdata(self):
        """Return every data row of the file as a list of mappings."""
        with open(self.filename, newline="", encoding=self.encoding) as csvfile:
            return list(csv.DictReader(csvfile))

    def getset(self):
        """Return the set of key-column values found in ``self.data``.

        Rows lacking the key column contribute ``None``.
        """
        return {row.get(self.lstname) for row in self.data}

    def fields(self):
        """Return the header row as a list, or ``[]`` for an empty file."""
        with open(self.filename, newline="", encoding=self.encoding) as csvfile:
            # The default avoids leaking StopIteration on an empty file.
            return next(csv.reader(csvfile), [])

    def __sub__(self, other):
        """Write the rows present only in this file and return the path.

        A row is kept when its key-column value is absent from
        ``other.dataset``.  The result is written next to this file as
        ``<stem of self>_<stem of other>.csv`` using this file's header.

        Raises:
            KeyError: if a row of this file has no ``self.lstname`` column.
        """
        # Key values that exist in this file but not in the other one.
        diff = self.dataset - other.dataset
        rdata = [row for row in self.data if row[self.lstname] in diff]

        # splitext strips only the final extension, so names such as
        # "a.v1.csv" keep their full stem instead of collapsing to "a".
        stem1 = os.path.splitext(os.path.basename(self.filename))[0]
        stem2 = os.path.splitext(os.path.basename(other.filename))[0]
        filename = os.path.join(os.path.dirname(self.filename),
                                stem1 + '_' + stem2 + '.csv')

        with open(filename, 'w', newline="", encoding=self.encoding) as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.header)
            writer.writeheader()
            writer.writerows(rdata)
        return filename

if __name__ == '__main__':
    # Command-line entry point: FILE1 FILE2 COLUMN
    # Writes two CSV files next to the inputs: the records found only in
    # FILE1 and the records found only in FILE2, matched on COLUMN.
    import sys

    if len(sys.argv) != 4:
        sys.exit('usage: %s FILE1 FILE2 COLUMN' % sys.argv[0])

    filename1, filename2, lstname = sys.argv[1:4]
    print(filename1,filename2)

    # csvread needs the key column name; it was previously omitted here,
    # which made every run fail with TypeError.
    file1 = csvread(filename1, lstname)
    file2 = csvread(filename2, lstname)

    # Each subtraction writes a result file and returns its path.
    print(file1 - file2)
    print(file2 - file1)