win用 重複排除スクリプト 2
概要
いろいろ修正
hashmargedb.py
#!/usr/bin/python
# -*- coding: cp932 -*-
"""Replace duplicate files with NTFS hard links (Windows, Python 2.7).

File paths are read from standard input, one per line.  For every existing
file whose name matches one of ``target_patterns()`` the SHA-256 digest of
its content is looked up (or computed and stored) in a SQLite database.  The
first path seen for a digest becomes the canonical copy; later paths with
the same digest are replaced by a hard link to that copy.

Usage: hashmargedb.py <hash.db> <output.log>  < list-of-paths

NOTE(review): digests are cached per path and never re-validated, so a file
whose content changed after it was registered keeps its old digest.  Confirm
this is acceptable for the intended use before relying on it.
"""
import sqlite3
import sys
import os
import pickle
import hashlib
import base64
import win32file
import locale
import re


def target_patterns():
    """Return the regex patterns (as strings) of file names to process."""
    return [
        r".*\.7z$", r".*\.aif$", r".*\.aifc$", r".*\.aiff$", r".*\.arj$",
        r".*\.asf$", r".*\.asx$", r".*\.au$", r".*\.avi$", r".*\.bmp$",
        r".*\.bz2$", r".*\.bzip2$", r".*\.cab$", r".*\.cda$", r".*\.chm$",
        r".*\.chw$", r".*\.cpio$", r".*\.cramfs$", r".*\.deb$", r".*\.dmg$",
        r".*\.dvr-ms$", r".*\.exe$", r".*\.flac$", r".*\.gif$",
        r".*\.gz$", r".*\.gzip$", r".*\.hxs$", r".*\.ico$", r".*\.iso$",
        r".*\.ivf$", r".*\.jpeg$", r".*\.jpg$", r".*\.lha$", r".*\.lzh$",
        r".*\.lzma$", r".*\.m1v$", r".*\.m3u$", r".*\.mbr$", r".*\.mid$",
        r".*\.midi$", r".*\.mov$", r".*\.mp2$", r".*\.mp3$", r".*\.mp4$",
        r".*\.mpa$", r".*\.mpe$", r".*\.mpeg$", r".*\.mpg$", r".*\.mpv2$",
        r".*\.msi$", r".*\.png$", r".*\.qt$", r".*\.ra$",
        r".*\.ram$", r".*\.rar$", r".*\.rm$", r".*\.rmi$", r".*\.rpm$",
        r".*\.snd$", r".*\.squashfs$", r".*\.swm$", r".*\.tar$", r".*\.taz$",
        r".*\.tbz$", r".*\.tbz2$", r".*\.tgz$", r".*\.tiff$", r".*\.wav$",
        r".*\.wax$", r".*\.wim$", r".*\.wm$", r".*\.wma$", r".*\.wmd$",
        r".*\.wms$", r".*\.wmv$", r".*\.wmz$", r".*\.wpl$", r".*\.wvx$",
        r".*\.xar$", r".*\.xz$", r".*\.z$", r".*\.zip$",
        # The dot is escaped so that only a real ".pdf" extension matches.
        r".*\.pdf$",
    ]


# Compiled once at import time; matching is case-sensitive.
pattern_re = [re.compile(pat) for pat in target_patterns()]


def matches_file_pattern(fname):
    """Return True if *fname* matches any of the target patterns."""
    return any(p_re.match(fname) for p_re in pattern_re)


def get_read_handle(filename):
    """Open *filename* for reading and return the Win32 handle.

    Directories are opened with FILE_FLAG_BACKUP_SEMANTICS, plain files with
    no extra flags.  The caller is responsible for closing the handle.
    """
    if os.path.isdir(filename):
        dwFlagsAndAttributes = win32file.FILE_FLAG_BACKUP_SEMANTICS
    else:
        dwFlagsAndAttributes = 0
    return win32file.CreateFile(
        filename, win32file.GENERIC_READ, win32file.FILE_SHARE_READ,
        None, win32file.OPEN_EXISTING, dwFlagsAndAttributes, None
    )


def get_unique_id(hFile):
    """Return (volume, index_hi, index_lo) for the open handle *hFile*."""
    (
        attributes, created_at, accessed_at, written_at,
        volume, file_hi, file_lo, n_links, index_hi, index_lo
    ) = win32file.GetFileInformationByHandle(hFile)
    return volume, index_hi, index_lo


def files_are_equal(filename1, filename2):
    """Return True if both names refer to the same file on disk.

    This compares the (volume, file index) identity of the two names, not
    their content; it is used to detect paths that are already hard-linked
    to each other.
    """
    hFile1 = get_read_handle(filename1)
    try:
        hFile2 = get_read_handle(filename2)
        try:
            return get_unique_id(hFile1) == get_unique_id(hFile2)
        finally:
            hFile2.Close()
    finally:
        # Closed even when the second open or the query fails.
        hFile1.Close()


class HmDBO:
    """SQLite-backed store for path digests and canonical copies.

    Two tables are used and created on first use:

    * ``filehash``  -- sha256(path) hex digest -> content digest
    * ``linkcache`` -- content digest -> base64-encoded canonical path

    Paths are expected to be byte strings (Python 2 ``str``).
    """

    con = None

    def __init__(self, dbpath):
        self.con = sqlite3.connect(dbpath)
        if not self.existFileTable():
            self.createFileTable()
        if not self.existCacheTable():
            self.createCacheTable()

    def commit(self):
        """Commit pending changes."""
        self.con.commit()

    def close(self):
        """Close the connection; the object must not be used afterwards."""
        self.con.close()
        self.con = None

    def _hasRow(self, sql):
        """Return True if *sql* yields at least one row."""
        return self.con.execute(sql).fetchone() is not None

    #
    # filehash
    #
    def existFileTable(self):
        """Return True if the ``filehash`` table exists."""
        sql = u"""SELECT * FROM sqlite_master WHERE type='table' AND name='filehash';"""
        return self._hasRow(sql)

    def createFileTable(self):
        """Create the ``filehash`` table."""
        sql = u"""create table filehash(path varchar(64) primary key, hash varchar(64));"""
        self.con.execute(sql)

    def insertFile(self, path, hash):
        """Store the content digest *hash* for *path*."""
        sql = u"""insert into filehash values (?,?); """
        hxpath = hashlib.sha256(path).hexdigest()
        self.con.execute(sql, (hxpath, hash))

    def selectFile(self, path):
        """Return the stored content digest for *path*, or None."""
        sql = u"""select hash from filehash where path=?;"""
        hxpath = hashlib.sha256(path).hexdigest()
        row = self.con.execute(sql, (hxpath,)).fetchone()
        return None if row is None else row[0]

    #
    # linkcache
    #
    def existCacheTable(self):
        """Return True if the ``linkcache`` table exists."""
        sql = u"""SELECT * FROM sqlite_master WHERE type='table' AND name='linkcache';"""
        return self._hasRow(sql)

    def createCacheTable(self):
        """Create the ``linkcache`` table."""
        sql = u"""create table linkcache(hash varchar(64) primary key, path text);"""
        self.con.execute(sql)

    def insertCache(self, hash, path):
        """Register *path* as the canonical copy for content digest *hash*."""
        sql = u"""insert into linkcache values (?,?)"""
        b64path = base64.b64encode(path)
        self.con.execute(sql, (hash, b64path))

    def selectCache(self, hash):
        """Return the canonical path for content digest *hash*, or None."""
        sql = u"""select path from linkcache where hash=?;"""
        row = self.con.execute(sql, (hash,)).fetchone()
        return None if row is None else base64.b64decode(row[0])

    def deleteCache(self, hash):
        """Forget the canonical copy for content digest *hash*."""
        sql = u"""delete from linkcache where hash=?;"""
        self.con.execute(sql, (hash,))


def calcSHA256(path):
    """Return the SHA-256 hex digest of the content of *path*."""
    sha = hashlib.sha256()
    # Binary mode is essential: text mode on Windows translates line endings
    # and may stop at a Ctrl-Z byte, which yields wrong digests for binary
    # files and could make different files look identical.
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(65536), b''):
            sha.update(chunk)
    return sha.hexdigest()


class TeeErr:
    """Writer that duplicates every message to a log file and to stderr."""

    fp = None

    def __init__(self, path):
        self.fp = open(path, 'w')

    def write(self, line):
        self.fp.write(line)
        sys.stderr.write(line)

    def close(self):
        self.fp.close()


def _replace_with_hardlink(path, target):
    """Replace *path* by a hard link to *target* without losing the name.

    The link is first created under a temporary name next to *path*; only
    when that succeeded is the original removed and the link renamed into
    place.  A failure to create the link therefore leaves *path* untouched.
    """
    staging = path + '.hmtmp'
    win32file.CreateHardLink(staging, target, None)
    try:
        os.remove(path)
    except Exception:
        # The original is still in place; drop the staging link again.
        os.remove(staging)
        raise
    os.rename(staging, path)


def _process_file(dbo, tee, path, stats):
    """Register *path* and merge it with its canonical copy if one exists.

    Counters in *stats* ('registered', 'merged', 'freed') are updated in
    place.  Any exception is propagated to the caller.
    """
    # filehash
    digest = dbo.selectFile(path)
    if digest is None:
        tee.write('register ')
        digest = calcSHA256(path)
        dbo.insertFile(path, digest)
        stats['registered'] += 1

    # linkcache
    canonical = dbo.selectCache(digest)
    if canonical is None:
        dbo.insertCache(digest, path)
    elif not os.path.isfile(canonical):
        # The recorded canonical copy is gone; let this path take its place
        # so that later duplicates can still be merged.
        dbo.deleteCache(digest)
        dbo.insertCache(digest, path)
    elif not files_are_equal(canonical, path):
        tee.write('marge ')
        _replace_with_hardlink(path, canonical)
        stats['freed'] += os.path.getsize(path)
        stats['merged'] += 1


def hashmarge(dbpath, log):
    """Process the paths given on stdin and write a report.

    dbpath -- path of the SQLite database (created if missing)
    log    -- path of the log file; everything is also echoed to stderr

    Files that raise an error while being processed are counted as skipped
    and processing continues with the next path.
    """
    stats = {'files': 0, 'registered': 0, 'merged': 0, 'skipped': 0,
             'freed': 0}
    dbo = HmDBO(dbpath)
    tee = TeeErr(log)
    try:
        try:
            # iter(readline, "") reads line by line without read-ahead.
            for rwln in iter(sys.stdin.readline, ""):
                path = rwln.rstrip('\n')
                if not os.path.isfile(path):
                    continue
                if not matches_file_pattern(os.path.basename(path)):
                    continue
                tee.write(path + '...')
                stats['files'] += 1
                try:
                    _process_file(dbo, tee, path, stats)
                except Exception:
                    # Best effort: one failing file must not stop the run.
                    # KeyboardInterrupt/SystemExit are deliberately not
                    # caught so the run can be aborted.
                    stats['skipped'] += 1
                    tee.write('skip ')
                tee.write('done\n')
        finally:
            # Keep what was registered so far, even when interrupted.
            dbo.commit()
            dbo.close()

        tee.write('処理ファイル数 = %s\n' % stats['files'])
        tee.write('データ内容を登録したファイル数 = %s\n' % stats['registered'])
        tee.write('内容を共有させたファイル数 = %s\n' % stats['merged'])
        tee.write('処理をスキップしたファイル数 = %s\n' % stats['skipped'])
        # Plain integer with locale digit grouping; unlike locale.currency
        # this also works in the "C" locale and adds no fraction digits.
        tee.write('開放されたディスクサイズ = %s byte\n'
                  % locale.format_string('%d', stats['freed'], grouping=True))
    finally:
        tee.close()


def main():
    """Command-line entry point: hashmargedb.py <hash.db> <output.log>."""
    if len(sys.argv) < 3:
        sys.stderr.write(
            'usage: hashmargedb.py <hash.db> <output.log> < list-of-paths\n')
        sys.exit(1)
    hashmarge(sys.argv[1], sys.argv[2])


if __name__ == '__main__':
    # Use the user's default locale so digit grouping follows the system.
    locale.setlocale(locale.LC_ALL, '')
    main()
kickstart.bat
rem Drag-and-drop launcher for hashmargedb.py.
rem Every argument is expanded with "dir /s/b" into a list of full paths,
rem which is piped to the script.  The hash database (hash.db) and the log
rem (output.log) are placed next to this batch file (%~dp0).
rem
rem "%~1" removes surrounding quotes from the first argument before it is
rem re-quoted; with a plain "%1" an argument that arrives already quoted
rem (for example a path containing spaces) produces nested quotes and makes
rem the IF statement fail to parse.
IF "%~1"=="" (
    echo "ファイルをドロップしてください"
) ELSE (
    ( for %%a in (%*) do dir /s/b %%a ) | C:\python27\python "%~dp0hashmargedb.py" "%~dp0hash.db" "%~dp0output.log"
)
pause
あとがき
これで一応人に使わせられる状態になったかな?
追記
doc,xls,ppt も処理対象になっていたので取り除いた