win用 重複排除スクリプト
動作に必要なランタイム
python 2.7 (32bit版)
Python Release Python 2.7.3 | Python.org
ここから「Windows x86 MSI Installer (2.7.3)」をダウンロードする
win32extensions for python
http://starship.python.net/~skippy/win32/Downloads.html
ここから、python 2.7(32bit版)対応のバイナリをダウンロードする。
dedupwin.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""File de-duplication for Windows (Python 2.7 + pywin32).

File paths are read from standard input, one per line.  Files that have the
same size are hashed with SHA-256, and every file whose content duplicates
another file on the same volume is replaced by a hard link to it.  Sizes,
hashes and file identities are cached in an SQLite database between runs.

Known limitation: a cached hash is invalidated only when the file size,
volume serial or file index changes.  A file rewritten in place with the same
size keeps its cached hash; the byte-for-byte comparison performed right
before linking prevents such a file from being merged by mistake.

NOTE: the file index stored in ``st_ino`` is now the full 64-bit NTFS index
and hashes are computed in binary mode.  Databases written by earlier
versions still work (stale rows are refreshed on the fly), but deleting the
old database gives the cleanest result.
"""
import base64
import filecmp
import hashlib
import os
import pickle  # not used below; kept from the original import list
import sqlite3
import sys

import win32file


def get_read_handle(filename):
    """Open *filename* (file or directory) for reading and return the handle.

    Directories can only be opened with FILE_FLAG_BACKUP_SEMANTICS.
    """
    if os.path.isdir(filename):
        flags = win32file.FILE_FLAG_BACKUP_SEMANTICS
    else:
        flags = 0
    return win32file.CreateFile(
        filename,
        win32file.GENERIC_READ,
        win32file.FILE_SHARE_READ,
        None,
        win32file.OPEN_EXISTING,
        flags,
        None
    )


class dmyStat:
    """Minimal stand-in for ``os.stat`` built on GetFileInformationByHandle.

    Attributes:
        st_size: file size in bytes.
        st_dev:  volume serial number, as returned by pywin32.
        st_ino:  64-bit file index, folded into the signed 64-bit range so
                 that SQLite can store it as an INTEGER.
    """
    st_size = 0
    st_dev = 0
    st_ino = 0

    def __init__(self, path):
        hFile = get_read_handle(path)
        try:
            (
                attributes,
                created_at, accessed_at, written_at,
                volume,
                file_hi, file_lo,
                n_links,
                index_hi, index_lo
            ) = win32file.GetFileInformationByHandle(hFile)
        finally:
            # Release the handle even when the query itself fails.
            hFile.Close()
        # Mask every 32-bit half: depending on the pywin32 build the values
        # may come back as signed integers.
        self.st_size = ((file_hi & 0xFFFFFFFF) << 32) | (file_lo & 0xFFFFFFFF)
        self.st_dev = volume
        ino = ((index_hi & 0xFFFFFFFF) << 32) | (index_lo & 0xFFFFFFFF)
        # SQLite integers are signed 64-bit.  Wrapping keeps the mapping
        # one-to-one (the previous ``index_hi * 2**30`` packing could map two
        # different files onto the same number) without overflowing.
        if ino >= 2 ** 63:
            ino -= 2 ** 64
        self.st_ino = ino


class HmDBO:
    """Thin data-access object around the ``fileinfo`` cache table.

    Rows are keyed by the SHA-256 hex digest of the path; the path itself is
    stored base64-encoded.
    """
    con = None

    def __init__(self, dbpath):
        """Open (and, if needed, initialise) the database at *dbpath*."""
        self.con = sqlite3.connect(dbpath)
        if not self.existFileInfoTable():
            self.createFileInfoTable()

    def commit(self):
        """Commit the pending transaction."""
        self.con.commit()

    def close(self):
        """Close the connection; the object is unusable afterwards."""
        self.con.close()
        self.con = None

    #
    # fileinfo
    #
    def existFileInfoTable(self):
        """Return True when the ``fileinfo`` table already exists."""
        sql = u"""SELECT * FROM sqlite_master WHERE type='table' AND name='fileinfo';"""
        return self.con.execute(sql).fetchone() is not None

    def createFileInfoTable(self):
        """Create the ``fileinfo`` table and its indexes."""
        statements = (
            u"""
            create table fileinfo(
                hxpath varchar(64) primary key,
                b64path text,
                size integer,
                hash varchar(64),
                st_dev integer,
                st_ino integer
            );""",
            u"""create index sizeindex on fileinfo(size);""",
            u"""create index devindex on fileinfo(st_dev,st_ino);""",
            u"""create index size2index on fileinfo(size,hash,st_dev);""",
            u"""create index inoindex on fileinfo(st_ino);""",
            u"""create index hashindex on fileinfo(hash,st_dev);""",
        )
        for sql in statements:
            self.con.execute(sql)

    def insertFileInfo(self, path, hash, st):
        """Insert a new row for *path*; *hash* may be None (not hashed yet)."""
        sql = u"""insert into fileinfo values (?,?,?,?,?,?);"""
        hxpath = hashlib.sha256(path).hexdigest()
        b64path = base64.b64encode(path)
        self.con.execute(
            sql, (hxpath, b64path, st.st_size, hash, st.st_dev, st.st_ino))

    def updateFileInfo(self, path, hash, st):
        """Overwrite size, hash and file identity of the row for *path*."""
        sql = u"""
            update fileinfo
            set size = ?, hash = ?, st_dev = ?, st_ino = ?
            where hxpath = ?;"""
        hxpath = hashlib.sha256(path).hexdigest()
        self.con.execute(
            sql, (st.st_size, hash, st.st_dev, st.st_ino, hxpath))

    def deleteFileInfo(self, path):
        """Remove the row for *path* (no-op when there is none)."""
        sql = u"""delete from fileinfo where hxpath = ?;"""
        hxpath = hashlib.sha256(path).hexdigest()
        self.con.execute(sql, (hxpath,))

    def selectFileInfo(self, path):
        """Return ``(size, hash, st_dev, st_ino)`` for *path*.

        All four values are None when the path is not in the table.
        """
        sql = u"""select size,hash,st_dev,st_ino from fileinfo where hxpath=?;"""
        hxpath = hashlib.sha256(path).hexdigest()
        row = self.con.execute(sql, (hxpath,)).fetchone()
        if row is None:
            return (None, None, None, None)
        return (row[0], row[1], row[2], row[3])

    def searchHashCode(self, st):
        """Return a cached hash for the file identified by *st*, or None.

        A row qualifies when volume, file index AND size all match, i.e. it
        describes this very file or one of its hard links.  The size check
        keeps a file that was rewritten in place from inheriting its own
        outdated hash.  Rows whose path no longer exists are purged.
        """
        sql = u"""
            select b64path,hash from fileinfo
            where st_dev = ? and st_ino = ? and size = ?
              and hash is not null
            ;"""
        # Fetch everything first: rows may be deleted below, and a table
        # must not be modified while a cursor is still walking over it.
        rows = self.con.execute(
            sql, (st.st_dev, st.st_ino, st.st_size)).fetchall()
        for b64path, cached in rows:
            path = base64.b64decode(b64path)
            if os.path.isfile(path):
                return cached
            self.deleteFileInfo(path)
        return None

    def listupFileSize(self):
        """Yield every size that occurs in more than one row."""
        sql = u"""
            select size from fileinfo
            group by size
            having count(size) > 1 and count(st_ino) > 1
            ;"""
        for row in self.con.execute(sql):
            yield row[0]

    def searchFileFromSize(self, size):
        """Yield the paths of all rows with the given *size*."""
        sql = u"""
            select b64path from fileinfo
            where size = ?
            ;"""
        for row in self.con.execute(sql, (size,)):
            yield base64.b64decode(row[0])

    def searchFileDupHash(self):
        """Yield ``(hash, st_dev)`` for every hash shared by several rows
        on one volume.  Rows that have not been hashed are ignored."""
        sql = u"""
            select hash,st_dev from fileinfo
            where hash is not null
            group by size,hash,st_dev
            having count(st_ino) > 1
            ;"""
        for row in self.con.execute(sql):
            yield (row[0], row[1])

    def searchFileFromHashOne(self, hash, st_dev):
        """Return ``(path, st_ino)`` of one row with this hash and volume,
        or None when there is no such row."""
        sql = u"""
            select b64path,st_ino from fileinfo
            where hash = ? and st_dev = ?
            ;"""
        row = self.con.execute(sql, (hash, st_dev)).fetchone()
        if row is None:
            return None
        return (base64.b64decode(row[0]), row[1])

    def searchDedupFiles(self, hash, st_dev, st_ino):
        """Yield paths with this hash and volume whose file index differs
        from *st_ino*, i.e. files that are not yet linked to it."""
        sql = u"""
            select b64path from fileinfo
            where hash = ? and st_dev = ? and st_ino != ?
            ;"""
        for row in self.con.execute(sql, (hash, st_dev, st_ino)):
            yield base64.b64decode(row[0])


# Number of files whose content actually had to be read and hashed.
_registercount = 0


def calcSHA256(dbo, path, st):
    """Return the SHA-256 hex digest of *path*.

    A digest cached for the same physical file (see
    ``HmDBO.searchHashCode``) is reused; otherwise the file is read and
    ``_registercount`` is incremented.
    """
    global _registercount
    cached = dbo.searchHashCode(st)
    if cached is not None:
        return cached
    sys.stderr.write('.')
    _registercount += 1
    sha = hashlib.sha256()
    # Binary mode is essential on Windows: text mode translates CRLF and
    # stops at the first Ctrl-Z byte, which yields wrong digests.
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(65536), b''):
            sha.update(chunk)
    return sha.hexdigest()


def updateCache(dbo, path, calcHash):
    """Bring the database row for *path* in line with the file on disk.

    Args:
        dbo: open ``HmDBO`` instance.
        path: path of an existing file.
        calcHash: when true, make sure the row carries a hash.

    Returns:
        ``(size, hash, st_dev, st_ino)`` as now known; *hash* may be None.
    """
    size, hash, st_dev, st_ino = dbo.selectFileInfo(path)
    st = dmyStat(path)
    if size is None:
        # Path not seen before.
        if calcHash:
            hash = calcSHA256(dbo, path, st)
        dbo.insertFileInfo(path, hash, st)
        return (st.st_size, hash, st.st_dev, st.st_ino)

    changed = (size != st.st_size
               or st_dev != st.st_dev
               or st_ino != st.st_ino)
    if hash is None:
        if calcHash:
            hash = calcSHA256(dbo, path, st)
            dbo.updateFileInfo(path, hash, st)
        elif changed:
            # Keep size/identity current even without a hash; otherwise
            # the later grouping by size works on outdated values.
            dbo.updateFileInfo(path, None, st)
    elif changed:
        # The cached hash belongs to an older state of the file.
        hash = calcSHA256(dbo, path, st)
        dbo.updateFileInfo(path, hash, st)
    return (st.st_size, hash, st.st_dev, st.st_ino)


def releaseVer():
    """Return True when files should really be replaced (False = dry run)."""
    return True


def _same_content(path_a, path_b):
    """Return True only if both files exist and are byte-for-byte equal."""
    try:
        return filecmp.cmp(path_a, path_b, shallow=False)
    except EnvironmentError:
        return False


def _replace_with_hardlink(centerPath, tpath):
    """Replace *tpath* by a hard link to *centerPath*.

    The link is first created under a temporary name, so a failing
    CreateHardLink (missing source, per-file link limit, ...) raises before
    anything has been deleted.
    """
    tmppath = tpath + '.dedup-tmp'
    win32file.CreateHardLink(tmppath, centerPath, None)
    try:
        os.remove(tpath)
        os.rename(tmppath, tpath)
    except OSError:
        # If the original is still in place, drop the temporary link; if it
        # is already gone, keep the temporary link so the data stays
        # reachable next to the original location.
        if os.path.exists(tpath):
            os.remove(tmppath)
        raise


def hashmarge(dbpath):
    """Run one de-duplication pass over the paths listed on stdin.

    Phases: (1) record size and identity of every listed file, (2) hash all
    files that share their size with another one, (3) replace duplicates on
    the same volume by hard links.  Progress goes to stderr, one summary
    line per duplicate group to stdout.
    """
    global _registercount
    margecount = 0
    filecount = 0
    _registercount = 0
    dbo = HmDBO(dbpath)

    # register size
    for rwln in iter(sys.stdin.readline, ""):
        path = rwln.rstrip('\n')
        if os.path.isfile(path):
            sys.stderr.write('stat ' + path + '...')
            filecount += 1
            updateCache(dbo, path, False)
            sys.stderr.write(' done\n')
    dbo.commit()

    # update hash
    updates = sorted(
        path
        for size in dbo.listupFileSize()
        for path in dbo.searchFileFromSize(size)
    )
    for path in updates:
        sys.stderr.write('read ' + path + '...')
        if os.path.isfile(path):
            updateCache(dbo, path, True)
            sys.stderr.write(' done\n')
        else:
            dbo.deleteFileInfo(path)
            sys.stderr.write(' not found\n')
    dbo.commit()

    # linking
    # Query results are copied into lists because the loop bodies modify
    # the very rows these queries select.
    for hash, st_dev in list(dbo.searchFileDupHash()):
        center = dbo.searchFileFromHashOne(hash, st_dev)
        if center is None:
            continue
        centerPath, c_ino = center
        sys.stdout.write(
            "hash=%s,st_dev=%s,st_ino=%s,centerPath=%s\n"
            % (hash, st_dev, c_ino, centerPath))
        for tpath in list(dbo.searchDedupFiles(hash, st_dev, c_ino)):
            sys.stderr.write('link ' + tpath + '...')
            if releaseVer():
                if not _same_content(centerPath, tpath):
                    # At least one cached hash is out of date (or a file
                    # vanished).  Forget both rows so that the next run
                    # re-examines them, and leave this group alone.
                    dbo.deleteFileInfo(tpath)
                    dbo.deleteFileInfo(centerPath)
                    sys.stderr.write(
                        ' skipped (content differs; cache entries dropped)\n')
                    break
                _replace_with_hardlink(centerPath, tpath)
                updateCache(dbo, tpath, True)
            margecount += 1
            sys.stderr.write(' done\n')
    dbo.commit()
    dbo.close()

    sys.stderr.write('files = ' + str(filecount) + '\n')
    sys.stderr.write('register = ' + str(_registercount) + '\n')
    sys.stderr.write('marges = ' + str(margecount) + '\n')


def main():
    """Command-line entry point: ``dedupwin.py <database file>``."""
    if len(sys.argv) != 2:
        sys.stderr.write(
            'usage: dir /s/b <folder> | python dedupwin.py <database file>\n')
        sys.exit(1)
    hashmarge(sys.argv[1])


if __name__ == '__main__':
    main()
このファイルを適当な位置に設置する
起動用バッチファイル
windowsでも使いやすいように以下のようなバッチファイルを用意した
dir /s/b %1 | C:\python27\python "%~dp0dedupwin.py" "%~dp0dedup.db"
pause
あとはこのbatファイルに、重複排除したいフォルダをドラッグ&ドロップするだけで実行できる
(2013-03-07) 修正
・file_hiのmsbが1の場合に例外終了していたので修正
・バッチファイルにスペースが含まれるパスがD&Dされた場合の修正