win用 重複排除スクリプト 2

概要

いろいろ修正

  1. 複数ファイルのD&Dに対応
  2. オープン出来ないファイルをスキップ
  3. 処理対象の拡張子を限定
  4. kickstartスクリプトで複数フォルダのD&Dに対応
  5. 処理結果を日本語に
  6. 膨大なファイル数に対応するため 1号をベースに作り直し

hashmargedb.py

#!/usr/bin/python
# -*- coding: cp932 -*-

import sqlite3,sys,os,pickle,hashlib,base64
import win32file,locale,re

def target_patterns():
  """Return the regex source strings for the file names to be processed.

  Every entry has the form ``.*\\.<ext>$`` and therefore accepts a base name
  that ends in one of the listed extensions.  The extensions are spelled in
  lower case only.

  Returns:
    A new list of pattern strings on every call.
  """
  return [
    r".*\.7z$", r".*\.aif$", r".*\.aifc$", r".*\.aiff$", r".*\.arj$",
    r".*\.asf$", r".*\.asx$", r".*\.au$", r".*\.avi$", r".*\.bmp$",
    r".*\.bz2$", r".*\.bzip2$", r".*\.cab$", r".*\.cda$", r".*\.chm$",
    r".*\.chw$", r".*\.cpio$", r".*\.cramfs$", r".*\.deb$", r".*\.dmg$",
    r".*\.dvr-ms$", r".*\.exe$", r".*\.flac$", r".*\.gif$",
    r".*\.gz$", r".*\.gzip$", r".*\.hxs$", r".*\.ico$", r".*\.iso$",
    r".*\.ivf$", r".*\.jpeg$", r".*\.jpg$", r".*\.lha$", r".*\.lzh$",
    r".*\.lzma$", r".*\.m1v$", r".*\.m3u$", r".*\.mbr$", r".*\.mid$",
    r".*\.midi$", r".*\.mov$", r".*\.mp2$", r".*\.mp3$", r".*\.mp4$",
    r".*\.mpa$", r".*\.mpe$", r".*\.mpeg$", r".*\.mpg$", r".*\.mpv2$",
    r".*\.msi$", r".*\.png$", r".*\.qt$", r".*\.ra$",
    r".*\.ram$", r".*\.rar$", r".*\.rm$", r".*\.rmi$", r".*\.rpm$",
    r".*\.snd$", r".*\.squashfs$", r".*\.swm$", r".*\.tar$", r".*\.taz$",
    r".*\.tbz$", r".*\.tbz2$", r".*\.tgz$", r".*\.tiff$", r".*\.wav$",
    r".*\.wax$", r".*\.wim$", r".*\.wm$", r".*\.wma$", r".*\.wmd$",
    r".*\.wms$", r".*\.wmv$", r".*\.wmz$", r".*\.wpl$", r".*\.wvx$",
    r".*\.xar$", r".*\.xz$", r".*\.z$", r".*\.zip$",
    # The dot in front of "pdf" must be escaped like in every other entry;
    # unescaped it matched any character, so "xpdf" was accepted as a PDF.
    r".*\.pdf$",
  ]

# Compile every target pattern once at import time so that each file-name
# check can reuse the same compiled regex objects.
pattern_re = [re.compile(source) for source in target_patterns()]

def matches_file_pattern(fname):
  """Tell whether *fname* is accepted by at least one compiled pattern.

  Args:
    fname: file name to test against the entries of ``pattern_re``.

  Returns:
    True as soon as one pattern matches, False when none does.
  """
  return any(compiled.match(fname) for compiled in pattern_re)

def get_read_handle (filename):
  """Open *filename* for reading with win32file.CreateFile and return the handle.

  A directory is opened with FILE_FLAG_BACKUP_SEMANTICS; any other path is
  opened with no extra flags.  The handle is requested with GENERIC_READ
  access, FILE_SHARE_READ sharing and OPEN_EXISTING disposition.
  """
  flags = win32file.FILE_FLAG_BACKUP_SEMANTICS if os.path.isdir(filename) else 0
  return win32file.CreateFile (
    filename,
    win32file.GENERIC_READ,
    win32file.FILE_SHARE_READ,
    None,
    win32file.OPEN_EXISTING,
    flags,
    None
  )

def get_unique_id (hFile):
  """Return the (volume, index_hi, index_lo) triple reported for *hFile*.

  The triple is taken from the 5th, 9th and 10th fields of the 10-item
  result of win32file.GetFileInformationByHandle; the remaining fields are
  unpacked only to pin the expected shape and are otherwise unused.
  """
  info = win32file.GetFileInformationByHandle (hFile)
  (
    _attributes,
    _created, _accessed, _written,
    volume_serial,
    _size_hi, _size_lo,
    _link_count,
    idx_hi, idx_lo
  ) = info
  return volume_serial, idx_hi, idx_lo

def files_are_equal (filename1, filename2):
  """Tell whether both paths refer to the same file on disk.

  Two paths count as equal when get_unique_id() yields the same triple for
  the handles opened on them.

  Args:
    filename1: first path to open.
    filename2: second path to open.

  Returns:
    True when the identifiers of both handles are equal, otherwise False.

  Raises:
    Whatever get_read_handle() or get_unique_id() raise; both handles that
    were opened are closed before the exception propagates.
  """
  hFile1 = get_read_handle (filename1)
  try:
    hFile2 = get_read_handle (filename2)
    try:
      return get_unique_id (hFile1) == get_unique_id (hFile2)
    finally:
      # Close in reverse order of opening, even when the query fails.
      hFile2.Close ()
  finally:
    hFile1.Close ()

class HmDBO:
  """Thin sqlite3 wrapper around the two tables used by this script.

  filehash  : SHA-256 hex digest of a path  ->  hash value given by the caller
  linkcache : hash value                    ->  base64-encoded path
  """
  # sqlite3 connection; assigned in __init__ and reset to None by close().
  con = None

  def __init__(self,dbpath):
    """Connect to *dbpath* and create whichever table does not exist yet."""
    self.con = sqlite3.connect(dbpath)
    for exists, create in (
      (self.existFileTable, self.createFileTable),
      (self.existCacheTable, self.createCacheTable),
    ):
      if not exists():
        create()

  def commit(self):
    """Commit the pending transaction on the connection."""
    self.con.commit()

  def close(self):
    """Close the connection and drop the reference to it."""
    self.con.close()
    self.con = None

  #
  # filehash table
  #
  def existFileTable(self):
    """Return True when sqlite_master lists a table named 'filehash'."""
    sql = u"""SELECT * FROM sqlite_master WHERE type='table' AND name='filehash';"""
    return self.con.execute(sql).fetchone() is not None

  def createFileTable(self):
    """Create the filehash table."""
    sql = u"""create table filehash(path varchar(64) primary key, hash varchar(64));"""
    self.con.execute(sql)

  def insertFile(self,path,hash):
    """Store *hash* under the SHA-256 hex digest of *path*."""
    sql = u"""insert into filehash values (?,?); """
    key = hashlib.sha256(path).hexdigest()
    self.con.execute(sql,(key,hash))

  def selectFile(self,path):
    """Return the hash stored for *path*, or None when there is no row."""
    sql = u"""select hash from filehash where path=?;"""
    key = hashlib.sha256(path).hexdigest()
    row = self.con.execute(sql,(key,)).fetchone()
    if row is None:
      return None
    return row[0]

  #
  # linkcache table
  #
  def existCacheTable(self):
    """Return True when sqlite_master lists a table named 'linkcache'."""
    sql = u"""SELECT * FROM sqlite_master WHERE type='table' AND name='linkcache';"""
    return self.con.execute(sql).fetchone() is not None

  def createCacheTable(self):
    """Create the linkcache table."""
    sql = u"""create table linkcache(hash varchar(64) primary key, path text);"""
    self.con.execute(sql)

  def insertCache(self,hash,path):
    """Store *path*, base64-encoded, under *hash*."""
    sql = u"""insert into linkcache values (?,?)"""
    encoded = base64.b64encode(path)
    self.con.execute(sql,(hash,encoded))

  def selectCache(self,hash):
    """Return the decoded path stored under *hash*, or None when absent."""
    sql = u"""select path from linkcache where hash=?;"""
    row = self.con.execute(sql,(hash,)).fetchone()
    if row is None:
      return None
    return base64.b64decode(row[0])

  def deleteCache(self,hash):
    """Delete the linkcache row stored under *hash*, if any."""
    sql = u"""delete from linkcache where hash=?;"""
    self.con.execute(sql,(hash,))

def calcSHA256(path):
  """Return the SHA-256 hex digest of the raw bytes stored in *path*.

  Args:
    path: path of the file to hash.

  Returns:
    The digest as a hexadecimal string.

  Raises:
    IOError/OSError: when the file cannot be opened or read.
  """
  sha = hashlib.sha256()
  # Binary mode is required: text mode on Windows rewrites CR LF pairs and
  # stops reading at the first Ctrl-Z byte, so different binary files could
  # end up with the same digest.
  with open(path,'rb') as fp:
    # Read in 64 KiB chunks so large files are never loaded entirely.
    for chunk in iter(lambda: fp.read(65536), b''):
      sha.update(chunk)
  return sha.hexdigest()

class TeeErr:
  """Writer that sends every line both to a log file and to sys.stderr."""
  # Log file object; assigned in __init__.
  fp = None

  def __init__(self,path):
    """Open *path* in 'w' mode as the log file."""
    self.fp = open(path,'w')

  def write(self,line):
    """Write *line* to the log file first and then to standard error."""
    for sink in (self.fp, sys.stderr):
      sink.write(line)

  def close(self):
    """Close the log file; standard error is left untouched."""
    self.fp.close()

def _replace_with_hardlink(path, target):
  """Replace the file at *path* with a hard link to *target*.

  The link is first created under a temporary name next to *path*, so a
  failing CreateHardLink leaves the original file untouched.
  """
  tmp = path + '.hashmarge.tmp'
  win32file.CreateHardLink (tmp, target, None)
  try:
    os.remove(path)
  except Exception:
    # The original could not be removed: discard the spare link and let the
    # caller count this file as skipped.
    os.remove(tmp)
    raise
  os.rename(tmp, path)


def hashmarge(dbpath,log):
  """Replace duplicate files named on standard input with hard links.

  One path is read per line from sys.stdin.  A path is processed when it is
  an existing file whose base name passes matches_file_pattern().  For each
  such file the hash is looked up in (or computed and added to) the
  filehash table; the first path seen for a hash is stored in linkcache and
  every later file with the same hash is replaced by a hard link to it.
  Progress and a summary are written through TeeErr to *log* and stderr.

  Args:
    dbpath: path of the SQLite database handed to HmDBO.
    log: path of the log file handed to TeeErr.
  """
  filecount=0
  margecount=0
  registercount=0
  skip=0
  reduction=0

  dbo = HmDBO(dbpath)
  tee = TeeErr(log)
  try:
    try:
      for rwln in iter(sys.stdin.readline,""):
        path = rwln.rstrip('\r\n')
        if not (os.path.isfile(path) and matches_file_pattern(os.path.basename(path))):
          continue
        tee.write(path + '...')
        filecount += 1
        try:
          # filehash: reuse the stored hash or compute and register it.
          # NOTE(review): a stored hash is trusted even if the file was
          # modified after registration - confirm whether that can happen.
          digest = dbo.selectFile(path)
          if digest is None:
            registercount += 1
            tee.write('register ')
            digest = calcSHA256(path)
            dbo.insertFile(path,digest)
          # linkcache: first path seen for this hash becomes the link target.
          cache = dbo.selectCache(digest)
          if cache is None:
            dbo.insertCache(digest,path)
          elif not os.path.isfile(cache):
            # The remembered target is gone; without a refresh no file with
            # this hash could ever be merged again.
            dbo.deleteCache(digest)
            dbo.insertCache(digest,path)
          elif not files_are_equal(cache, path):
            tee.write('marge ')
            size = os.path.getsize(path)
            _replace_with_hardlink(path, cache)
            reduction += size
            margecount += 1
        except Exception:
          # Best effort per file: anything that fails is counted and skipped,
          # while Ctrl-C / SystemExit still stop the run.
          skip+=1
          tee.write('skip ')

        tee.write('done\n')
    finally:
      # Keep the rows gathered so far even when the run is interrupted.
      dbo.commit()
      dbo.close()

    tee.write('処理ファイル数                 = %s\n' % str(filecount))
    tee.write('データ内容を登録したファイル数 = %s\n' % str(registercount) )
    tee.write('内容を共有させたファイル数     = %s\n' % str(margecount))
    tee.write('処理をスキップしたファイル数   = %s\n' % str(skip))
    # A byte count is a plain integer: format_string groups digits without
    # currency rules and also works when the active locale is "C".
    tee.write('開放されたディスクサイズ       = %s byte\n' % locale.format_string('%d', reduction, grouping=True))
  finally:
    tee.close()

def main():
  """Command-line entry point: ``hashmargedb.py <dbpath> <logpath>``.

  With fewer than two arguments a usage line is written to stderr and the
  process exits with status 2; otherwise hashmarge() is run with the
  database path and the log path.
  """
  if len(sys.argv) < 3:
    # sys.exit is used instead of the exit() helper, which is only installed
    # by the site module for interactive sessions.
    sys.stderr.write('usage: %s <dbpath> <logpath>\n' % sys.argv[0])
    sys.exit(2)
  hashmarge(sys.argv[1],sys.argv[2])

if __name__ == '__main__':
  # Script entry: adopt the user's default locale for every category before
  # running, so the locale-aware number formatting in the summary applies.
  # Earlier variant that pinned only the numeric category, kept for reference:
  #locale.setlocale(locale.LC_NUMERIC, 'ja_JP')
  locale.setlocale(locale.LC_ALL, '')
  main()

kickstart.bat

REM Drag-and-drop launcher: every dropped file or folder arrives as an argument.
REM The dropped items are listed recursively with "dir /s/b" and the listing is
REM piped into hashmargedb.py, which lives next to this batch file (%~dp0).
REM %~1 strips the quotes that surround a dropped path containing spaces;
REM with plain %1 the doubled quotes broke the emptiness test.
IF "%~1" EQU "" (
  echo "ファイルをドロップしてください"
) ELSE (
  ( for %%a in (%*) do dir /s/b %%a ) | C:\python27\python "%~dp0hashmargedb.py" "%~dp0hash.db" "%~dp0output.log"
)
pause

あとがき

これで一応人に使わせられる状態になったかな?

追記

doc,xls,ppt も処理対象になっていたので取り除いた