win用 重複排除スクリプト

概要

前々回のエントリで紹介した重複排除スクリプト2号を Windows に対応させた。

動作に必要なランタイム

python 2.7 (32bit版)

Python Release Python 2.7.3 | Python.org
ここから Windows x86 MSI Installer (2.7.3) をダウンロードする

win32extensions for python

http://starship.python.net/~skippy/win32/Downloads.html
ここから、python 2.7(32bit版)対応のバイナリをダウンロードする。

dedupwin.py

#!/usr/bin/python

# -*- coding: utf-8 -*-

import sqlite3,sys,os,pickle,hashlib,base64

import win32file

def get_read_handle (filename):
  """Open an existing path for reading and return the CreateFile handle.

  A directory is opened with FILE_FLAG_BACKUP_SEMANTICS; any other path
  is opened with no extra flags.  The handle is requested with
  GENERIC_READ access and FILE_SHARE_READ sharing, and it is not closed
  here.
  """
  # Only directories get the backup-semantics flag.
  flags = win32file.FILE_FLAG_BACKUP_SEMANTICS if os.path.isdir(filename) else 0
  handle = win32file.CreateFile (
    filename,
    win32file.GENERIC_READ,      # desired access
    win32file.FILE_SHARE_READ,   # share mode
    None,                        # security attributes
    win32file.OPEN_EXISTING,     # never create the path
    flags,
    None                         # template file
  )
  return handle

class dmyStat:
  """Small stand-in for an os.stat() result, filled from the Win32 API.

  Attributes:
    st_size -- file size in bytes (high/low 32-bit halves combined)
    st_dev  -- volume serial number reported for the handle
    st_ino  -- 64-bit file index (high/low halves combined), wrapped into
               the signed 64-bit range so it fits an SQLite INTEGER column
  """
  st_size = 0
  st_dev = 0
  st_ino = 0

  def __init__(self, path):
    """Query *path* with GetFileInformationByHandle and store the result."""
    hFile = get_read_handle (path)
    try:
      info = win32file.GetFileInformationByHandle (hFile)
    finally:
      # Release the handle even when the query raises.
      hFile.Close ()
    # Tuple layout: attributes, created_at, accessed_at, written_at,
    # volume, file_hi, file_lo, n_links, index_hi, index_lo
    volume, file_hi, file_lo, n_links, index_hi, index_lo = info[4:10]
    self.st_size = self._join_dwords(file_hi, file_lo)
    self.st_dev = volume
    # The halves are 32 bits wide, so the high half must be shifted by 32;
    # a smaller shift lets two different indexes map to the same number.
    self.st_ino = self._to_signed64(self._join_dwords(index_hi, index_lo))

  @staticmethod
  def _join_dwords(hi, lo):
    """Combine two 32-bit halves into one unsigned 64-bit integer.

    Each half is masked to 32 bits first, so a half that arrives as a
    negative (signed) number is treated as its unsigned bit pattern.
    """
    return ((hi & 0xFFFFFFFF) << 32) | (lo & 0xFFFFFFFF)

  @staticmethod
  def _to_signed64(value):
    """Map an unsigned 64-bit integer onto the signed 64-bit range.

    The mapping is one-to-one, so distinct inputs stay distinct.
    """
    if value >= 2**63:
      value -= 2**64
    return value

class HmDBO:
  """SQLite store for the per-file cache used by the dedup run.

  One row of the ``fileinfo`` table describes one path:
    hxpath  -- SHA-256 hex digest of the path (primary key)
    b64path -- the path itself, base64 encoded
    size, hash, st_dev, st_ino -- size, content hash (may be NULL),
                                  and the device / inode-like identifiers

  Every query result is fetched completely before rows are handed to the
  caller, so the caller may modify the table while looping over a result.
  """
  con = None

  def __init__(self,dbpath):
    """Open (or create) the database at *dbpath* and ensure the table exists."""
    self.con = sqlite3.connect(dbpath)
    if not self.existFileInfoTable():
      self.createFileInfoTable()

  def commit(self):
    """Commit the current transaction."""
    self.con.commit()

  def close(self):
    """Close the connection and forget it."""
    self.con.close()
    self.con = None

  def _fetchAll(self,sql,params=()):
    """Run *sql* and return all result rows as a list."""
    return self.con.execute(sql,params).fetchall()

  @staticmethod
  def _pathKey(path):
    """Return the primary-key value (SHA-256 hex digest) for *path*."""
    return hashlib.sha256(path).hexdigest()

#
#fileinfo
#
  def existFileInfoTable(self):
    """Return True when the ``fileinfo`` table already exists."""
    sql = u"""SELECT * FROM sqlite_master WHERE type='table' AND name='fileinfo';"""
    return self.con.execute(sql).fetchone() is not None

  def createFileInfoTable(self):
    """Create the ``fileinfo`` table and its indexes."""
    statements = (
      u"""
      create table fileinfo(
        hxpath varchar(64) primary key,
        b64path text,
        size integer,
        hash varchar(64),
        st_dev integer,
        st_ino integer
        );""",
      u"""create index sizeindex on fileinfo(size);""",
      u"""create index devindex on fileinfo(st_dev,st_ino);""",
      u"""create index size2index on fileinfo(size,hash,st_dev);""",
      u"""create index inoindex on fileinfo(st_ino);""",
      u"""create index hashindex on fileinfo(hash,st_dev);""",
    )
    for sql in statements:
      self.con.execute(sql)

  def insertFileInfo(self,path,hash,st):
    """Insert a new row for *path*; *hash* may be None."""
    sql = u"""insert into fileinfo values (?,?,?,?,?,?); """
    b64path = base64.b64encode(path)
    self.con.execute(sql,(self._pathKey(path),b64path,st.st_size,hash,st.st_dev,st.st_ino))

  def updateFileInfo(self,path,hash,st):
    """Overwrite size, hash, st_dev and st_ino of the row for *path*."""
    sql = u"""update fileinfo set size = ?, hash = ?,st_dev = ?,st_ino = ? where hxpath = ?; """
    self.con.execute(sql,(st.st_size,hash,st.st_dev,st.st_ino,self._pathKey(path)))

  def deleteFileInfo(self,path):
    """Delete the row for *path* (no error when there is none)."""
    sql = u"""delete from fileinfo where hxpath = ?;"""
    self.con.execute(sql,(self._pathKey(path),))

  def selectFileInfo(self,path):
    """Return (size, hash, st_dev, st_ino) for *path*.

    All four values are None when the path has no row.
    """
    sql = u"""select size,hash,st_dev,st_ino from fileinfo where hxpath=?;"""
    row = self.con.execute(sql,(self._pathKey(path),)).fetchone()
    if row is None:
      return (None,None,None,None)
    return (row[0],row[1],row[2],row[3])

  def searchHashCode(self,st):
    """Return a cached hash of a file sharing st.st_dev / st.st_ino, or None.

    Rows whose path is no longer a regular file are deleted on the way.
    """
    sql = u"""
      select b64path,hash from fileinfo
      where st_dev = ?
        and st_ino = ?
        and hash is not null ;"""
    # Fetch everything first: rows are deleted from this same table below.
    for b64path,hash in self._fetchAll(sql,(st.st_dev,st.st_ino)):
      path = base64.b64decode(b64path)
      if os.path.isfile(path):
        return hash
      self.deleteFileInfo(path)
    return None

  def listupFileSize(self):
    """Yield each size that is recorded for more than one row."""
    sql = u"""
      select size from fileinfo 
      group by size 
      having count(size) > 1 
         and count(st_ino) > 1 ;"""
    for row in self._fetchAll(sql):
      yield row[0]

  def searchFileFromSize(self,size):
    """Yield the decoded path of every row with the given *size*."""
    sql = u""" select b64path from fileinfo where size = ? ; """
    for row in self._fetchAll(sql,(size,)):
      yield base64.b64decode(row[0])

  def searchFileDupHash(self):
    """Yield (hash, st_dev) for every size/hash/device group with 2+ rows.

    Rows without a hash are ignored: a NULL hash can never be matched by
    the ``hash = ?`` lookups that consume these pairs.
    """
    sql = u"""
      select hash,st_dev from fileinfo
      where hash is not null
      group by size,hash,st_dev
      having count(st_ino) > 1
    ;"""
    for row in self._fetchAll(sql):
      yield (row[0],row[1])

  def searchFileFromHashOne(self,hash,st_dev):
    """Return (path, st_ino) of one row matching *hash* / *st_dev*, or None."""
    sql = u"""
      select b64path,st_ino from fileinfo
       where hash   = ?
         and st_dev = ?  ; """
    row = self.con.execute(sql,(hash,st_dev)).fetchone()
    if row is None:
      return None
    return (base64.b64decode(row[0]), row[1])

  def searchDedupFiles(self,hash,st_dev,st_ino):
    """Yield paths matching *hash* / *st_dev* whose st_ino differs from *st_ino*."""
    sql = u"""
      select b64path from fileinfo
        where hash  = ?
          and st_dev = ?
          and st_ino != ? ;"""
    for row in self._fetchAll(sql,(hash,st_dev,st_ino)):
      yield base64.b64decode(row[0])

# Number of files whose content was actually read and hashed.
_registercount = 0

def calcSHA256(dbo,path,st):
  """Return the SHA-256 hex digest of the file at *path*.

  dbo.searchHashCode(st) is asked first; when it returns a hash, that
  value is returned and the file is not read.  Otherwise a '.' is written
  to stderr, the module counter _registercount is incremented, and the
  file content is hashed in 64 KiB chunks.
  """
  global _registercount
  uhash = dbo.searchHashCode(st)
  if uhash is not None:
    return uhash
  sys.stderr.write('.')
  _registercount += 1
  sha = hashlib.sha256()
  # Binary mode is required: text mode on Windows translates CRLF and
  # stops at the first 0x1A byte, which would hash the wrong data.
  with open(path,'rb') as fp:
    for chunk in iter(lambda: fp.read(65536), b''):
      sha.update(chunk)
  return sha.hexdigest()

def updateCache(dbo,path,calcHash):
  """Bring the cached row for *path* in line with the file on disk.

  dbo      -- database object providing selectFileInfo / insertFileInfo /
              updateFileInfo
  path     -- path of the file to refresh
  calcHash -- when true, a missing content hash is computed

  Returns (st_size, hash, st_dev, st_ino); *hash* is None when no hash is
  known and none was computed.
  """
  size,hash,st_dev,st_ino = dbo.selectFileInfo(path)
  st = dmyStat(path)

  if size is None:
    # No cached row yet: insert one, hashing only on request.
    if calcHash and hash is None:
      hash = calcSHA256(dbo,path,st)
      dbo.insertFileInfo(path,hash,st)
    else:
      dbo.insertFileInfo(path,None,st)
  else:
    changed = (size != st.st_size or st_dev != st.st_dev or st_ino != st.st_ino)
    if (calcHash and hash is None) or (hash is not None and changed):
      # Either a hash was requested and is missing, or the stored hash
      # belongs to a file whose size/device/inode no longer match.
      hash = calcSHA256(dbo,path,st)
      dbo.updateFileInfo(path,hash,st)
    elif changed:
      # No hash stored and none requested, but the metadata moved on:
      # refresh the row so size-based lookups do not use stale values.
      dbo.updateFileInfo(path,None,st)
  return (st.st_size, hash, st.st_dev, st.st_ino)

def releaseVer():
  """Return True; hashmarge only replaces files while this is true."""
  release = True
  return release

def _replaceWithHardLink(centerPath,tpath):
  """Replace the file *tpath* with a hard link to *centerPath*.

  The link is first created under a temporary name next to *tpath*, so a
  failing CreateHardLink leaves *tpath* untouched.  Only after the link
  exists is *tpath* removed and the link renamed into its place.
  """
  tmpPath = tpath + '.dedup-tmp'
  win32file.CreateHardLink (tmpPath, centerPath, None)
  replaced = False
  try:
    os.remove(tpath)
    os.rename(tmpPath, tpath)
    replaced = True
  finally:
    # Removing tpath failed: drop the temporary link again.  When only the
    # rename failed, the temporary link is kept so the content stays
    # reachable next to the original location.
    if not replaced and os.path.exists(tpath) and os.path.exists(tmpPath):
      os.remove(tmpPath)

def hashmarge(dbpath):
  """Deduplicate the files listed on stdin by hard-linking equal content.

  dbpath -- path of the SQLite cache database (created when missing)

  Steps:
    1. read one path per line from stdin and record size/device/inode
       for every regular file,
    2. compute content hashes for all cached paths whose size occurs
       more than once (paths that vanished are dropped from the cache),
    3. for every group with equal hash on one device, replace the other
       files by hard links to one representative of the group.
  Progress goes to stderr, one summary line per group to stdout.
  """
  margecount=0
  filecount=0
  global _registercount
  _registercount=0
  
  dbo = HmDBO(dbpath)
  #register size
  for rwln in iter(sys.stdin.readline,""):
    # Strip the line terminator, including a stray carriage return.
    path = rwln.rstrip('\r\n')
    if os.path.isfile(path):
      sys.stderr.write('stat ' + path + '...')
      filecount += 1
      #fileinfo
      updateCache(dbo,path,False)
      sys.stderr.write(' done\n')
  dbo.commit()
  #update hash
  updates = list()
  for size in dbo.listupFileSize():
    for path in dbo.searchFileFromSize(size):
      updates.append(path)
  updates.sort()
  for path in updates:
    sys.stderr.write('read ' + path + '...')
    if os.path.isfile(path):
      updateCache(dbo,path,True)
      sys.stderr.write(' done\n')
    else:
      dbo.deleteFileInfo(path)
      sys.stderr.write(' not found\n')
  dbo.commit()
  
  #linking
  for hash,st_dev in dbo.searchFileDupHash():
    center = dbo.searchFileFromHashOne(hash,st_dev)
    if center is None:
      # No row matches this hash/device pair; nothing to link against.
      continue
    (centerPath,c_ino) = center
    sys.stdout.write("hash=%s,st_dev=%s,st_ino=%s,centerPath=%s\n" % (hash,st_dev,c_ino,centerPath))
    for tpath in dbo.searchDedupFiles(hash,st_dev,c_ino):
      sys.stderr.write('link ' + tpath + '...')
      if releaseVer():
        _replaceWithHardLink(centerPath,tpath)
        updateCache(dbo,tpath,True)
      margecount += 1
      sys.stderr.write(' done\n')
 
  dbo.commit()
  dbo.close()

  sys.stderr.write('files  = ' + str(filecount) + '\n')
  sys.stderr.write('register = ' + str(_registercount) + '\n')
  sys.stderr.write('marges = ' + str(margecount) + '\n')


def main():
  """Command-line entry point: ``dedupwin.py <database path>``.

  Exits immediately when no database path is given.  With exactly one
  argument the dedup run is started; with more arguments nothing is done.
  """
  argc = len(sys.argv)
  if argc < 2:
    # sys.exit() instead of the bare exit(): the latter is injected by the
    # site module and is missing under "python -S" or in frozen programs.
    sys.exit()
  elif argc == 2:
    hashmarge(sys.argv[1])

if __name__ == '__main__':
  main()

このファイルを適当な位置に設置する

起動用バッチファイル

windowsでも使いやすいように以下のようなバッチファイルを用意した

rem List every path below the dropped folder (%1) recursively in bare
rem format and pipe the list to dedupwin.py.  Both the script and the
rem cache database dedup.db are addressed relative to this batch file's
rem own directory (%~dp0).
dir /s/b %1 | C:\python27\python "%~dp0dedupwin.py" "%~dp0dedup.db"

rem Wait for a key press before the batch file ends.
pause

あとはこのbatファイルに、重複排除したいフォルダをドラッグ&ドロップするだけで実行できる

(2013-03-07) 修正

・file_hiのmsbが1の場合に例外終了していたので修正
・バッチファイルにスペースが含まれるパスがD&Dされた場合の修正