重複排除スクリプト 2号

概要

重複ファイルの発見から重複の除去までを全自動で行うため

指定のファイルリストの重複チェックを行い、
見つけた重複ファイルは自動的に「ハードリンク」するスクリプトを書いた
半年以上動作しているのでそこそこ安定しているはず

「ハードリンク」の説明

例えば内容が同じ"./A/B/hoge.avi"と"./C/D/puga.avi"を「ハードリンク」すると
それぞれのパスはそのままで、ファイルの内容が共有される。
ショートカット機能との違いは「どっちも本物」な事。

スクリプト

以下のスクリプトを dedup.py 等として保存する

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sqlite3,sys,os,pickle,hashlib,base64

class HmDBO:
  """SQLite-backed cache of per-file metadata used for duplicate detection.

  One row of the ``fileinfo`` table describes one path: its size, an
  optional content hash, and the device/inode pair reported by os.stat().
  Paths are stored base64-encoded (``b64path``) and keyed by the SHA-256
  hex digest of the path (``hxpath``).

  Every query method reads its complete result set with fetchall() before
  handing rows to the caller.  Callers modify ``fileinfo`` on this same
  connection while they iterate, and SQLite does not define whether a
  still-running SELECT observes such changes.
  """
  # Open sqlite3 connection; assigned in __init__, reset to None by close().
  con = None

  def __init__(self,dbpath):
    """Open (or create) the database at dbpath and make sure the table exists."""
    self.con = sqlite3.connect(dbpath)
    if not self.existFileInfoTable():
      self.createFileInfoTable()

  def commit(self):
    """Commit the pending transaction."""
    self.con.commit()

  def close(self):
    """Close the connection; the object must not be used afterwards."""
    self.con.close()
    self.con = None

  @staticmethod
  def _hxpath(path):
    """Return the SHA-256 hex digest of path, the primary key of fileinfo."""
    return hashlib.sha256(path).hexdigest()

#
#fileinfo
#
  def existFileInfoTable(self):
    """Return True when the fileinfo table is already present."""
    sql = u"""SELECT * FROM sqlite_master WHERE type='table' AND name='fileinfo';"""
    return self.con.execute(sql).fetchone() is not None

  def createFileInfoTable(self):
    """Create the fileinfo table and the indexes used by the queries below."""
    statements = (
      u"""
      create table fileinfo(
        hxpath varchar(64) primary key,
        b64path text,
        size integer,
        hash varchar(64),
        st_dev integer,
        st_ino integer
        );""",
      u"""create index sizeindex on fileinfo(size);""",
      u"""create index devindex on fileinfo(st_dev,st_ino);""",
      u"""create index size2index on fileinfo(size,hash,st_dev);""",
      u"""create index inoindex on fileinfo(st_ino);""",
      u"""create index hashindex on fileinfo(hash,st_dev);""",
    )
    for sql in statements:
      self.con.execute(sql)

  def insertFileInfo(self,path,hash,st):
    """Insert a row for path.

    hash may be None when the content has not been read yet; st is an
    os.stat() result supplying st_size, st_dev and st_ino.
    """
    sql = u"""insert into fileinfo values (?,?,?,?,?,?); """
    b64path = base64.b64encode(path)
    self.con.execute(sql,(self._hxpath(path),b64path,st.st_size,hash,st.st_dev,st.st_ino))

  def updateFileInfo(self,path,hash,st):
    """Overwrite size, hash, st_dev and st_ino of the row belonging to path."""
    sql = u"""update fileinfo set size = ?, hash = ?,st_dev = ?,st_ino = ? where hxpath = ?; """
    self.con.execute(sql,(st.st_size,hash,st.st_dev,st.st_ino,self._hxpath(path)))

  def deleteFileInfo(self,path):
    """Delete the row belonging to path (no-op when there is none)."""
    sql = u"""delete from fileinfo where hxpath = ?;"""
    self.con.execute(sql,(self._hxpath(path),))

  def selectFileInfo(self,path):
    """Return (size, hash, st_dev, st_ino) for path, or four Nones if unknown."""
    sql = u"""select size,hash,st_dev,st_ino from fileinfo where hxpath=?;"""
    row = self.con.execute(sql,(self._hxpath(path),)).fetchone()
    if row is None:
      return (None,None,None,None)
    return (row[0],row[1],row[2],row[3])

  def searchHashCode(self,st):
    """Return a cached hash recorded for the file identified by st, or None.

    A row qualifies when device, inode AND size all match st.  Matching on
    device/inode alone would hand back the stale hash of a file whose
    content was rewritten in place (same inode, different size).
    Qualifying rows whose path is no longer a regular file are deleted.
    """
    sql = u"""
      select b64path,hash from fileinfo
      where st_dev = ?
        and st_ino = ?
        and size = ?
        and hash is not null ;"""
    rows = self.con.execute(sql,(st.st_dev,st.st_ino,st.st_size)).fetchall()
    for b64path,cached in rows:
      path = base64.b64decode(b64path)
      if os.path.isfile(path):
        return cached
      self.deleteFileInfo(path)
    return None

  def listupFileSize(self):
    """Yield every size value that is recorded for more than one row."""
    sql = u"""
      select size from fileinfo 
      group by size 
      having count(size) > 1 
         and count(st_ino) > 1 ;"""
    for row in self.con.execute(sql).fetchall():
      yield row[0]

  def searchFileFromSize(self,size):
    """Yield the decoded path of every row with the given size."""
    sql = u""" select b64path from fileinfo where size = ? ; """
    for row in self.con.execute(sql,(size,)).fetchall():
      yield base64.b64decode(row[0])

  def searchFileDupHash(self):
    """Yield (hash, st_dev) for each size/hash/device group with several rows."""
    sql = u"""
      select hash,st_dev from fileinfo
      group by size,hash,st_dev
      having count(st_ino) > 1
    ;"""
    for row in self.con.execute(sql).fetchall():
      yield (row[0],row[1])

  def searchFileFromHashOne(self,hash,st_dev):
    """Return (path, st_ino) of one row matching hash and st_dev, or None."""
    sql = u"""
      select b64path,st_ino from fileinfo
       where hash   = ?
         and st_dev = ?  ; """
    row = self.con.execute(sql,(hash,st_dev)).fetchone()
    if row is None:
      return None
    return (base64.b64decode(row[0]), row[1])

  def searchDedupFiles(self,hash,st_dev,st_ino):
    """Yield paths with the given hash and device whose inode differs from st_ino."""
    sql = u"""
      select b64path from fileinfo
        where hash  = ?
          and st_dev = ?
          and st_ino != ? ;"""
    for row in self.con.execute(sql,(hash,st_dev,st_ino)).fetchall():
      yield base64.b64decode(row[0])

# Number of files whose content was actually read and hashed by calcSHA256()
# (cache hits are not counted).
_registercount = 0

def calcSHA256(dbo,path,st):
  """Return the SHA-256 hex digest of the file at path.

  dbo.searchHashCode(st) is consulted first; when it returns a hash, that
  value is returned and the file is not read.  Otherwise a '.' is written
  to stderr, the module-level _registercount is incremented, and the file
  is read in 64 KiB chunks.

  The file is opened in binary mode so the digest covers the exact bytes
  on disk regardless of platform newline handling or text decoding.
  """
  global _registercount
  uhash = dbo.searchHashCode(st)
  if uhash is not None:
    return uhash
  sys.stderr.write('.')
  _registercount += 1
  sha = hashlib.sha256()
  # 'with' guarantees the handle is closed even when a read fails.
  with open(path,'rb') as fp:
    while True:
      chunk = fp.read(65536)
      if not chunk:
        break
      sha.update(chunk)
  return sha.hexdigest()

def updateCache(dbo,path,calcHash):
  """Bring the fileinfo row of path in line with the file's current stat.

  dbo      -- database object providing selectFileInfo / insertFileInfo /
              updateFileInfo.
  path     -- path of an existing file (os.stat() is called on it).
  calcHash -- when true, a missing hash is computed via calcSHA256().

  Behaviour:
    * no row yet: a row is inserted, hashed only when calcHash is true;
    * row with a hash whose size/device/inode no longer match: the hash is
      recomputed and the row updated;
    * row without a hash: hashed and updated when calcHash is true;
      otherwise the row is refreshed (hash stays NULL) if size, device or
      inode changed, so later size-based grouping does not work on stale
      values.

  Returns (st_size, hash, st_dev, st_ino) where hash may be None.
  """
  size,digest,st_dev,st_ino = dbo.selectFileInfo(path)
  st = os.stat(path)

  if size is None:
    # Unknown path: register it, reading the content only on request.
    if calcHash and digest is None:
      digest = calcSHA256(dbo,path,st)
      dbo.insertFileInfo(path,digest,st)
    else:
      dbo.insertFileInfo(path,None,st)
    return (st.st_size, digest, st.st_dev, st.st_ino)

  changed = (size != st.st_size or st_dev != st.st_dev or st_ino != st.st_ino)
  if digest is None:
    if calcHash:
      digest = calcSHA256(dbo,path,st)
      dbo.updateFileInfo(path,digest,st)
    elif changed:
      # Keep the recorded size/device/inode current even without a hash.
      dbo.updateFileInfo(path,None,st)
  elif changed:
    # The recorded hash belongs to different content; recompute it.
    digest = calcSHA256(dbo,path,st)
    dbo.updateFileInfo(path,digest,st)
  return (st.st_size, digest, st.st_dev, st.st_ino)

def releaseVer():
  """Return True to enable the destructive linking step.

  hashmarge() checks this before it removes a duplicate and re-creates it
  as a hard link; the function currently always returns True.
  """
  return True

def _replaceWithHardLink(src,dst):
  """Replace dst with a hard link to src without ever leaving dst missing.

  The link is first created under a temporary name next to dst and then
  renamed over dst.  If creating the link fails, dst is untouched; the
  original remove-then-link order would have lost the file in that case.
  """
  tmp = dst + '.dedup-%d.tmp' % os.getpid()
  os.link(src,tmp)
  try:
    os.rename(tmp,dst)
  finally:
    # rename() leaves tmp behind when it fails, and POSIX also makes it a
    # no-op when tmp and dst are already links to the same file.
    if os.path.lexists(tmp):
      os.remove(tmp)

def hashmarge(dbpath):
  """Deduplicate the files listed on stdin by hard-linking identical ones.

  dbpath is the SQLite cache file (created if missing).  The work happens
  in three phases:
    1. every regular file named on stdin (one path per line) is stat'ed
       and recorded in the cache;
    2. every cached file that shares its size with another cached file is
       hashed (cache entries whose file has disappeared are deleted);
    3. within each group of files with equal hash on the same device, all
       files are replaced by hard links to one member of the group.
  Progress and a final summary are written to stderr.
  """
  margecount=0
  filecount=0
  global _registercount
  _registercount=0

  dbo = HmDBO(dbpath)
  #register size
  for rwln in iter(sys.stdin.readline,""):
    path = rwln.rstrip('\n')
    if os.path.isfile(path):
      sys.stderr.write('stat ' + path + '...')
      filecount += 1
      #fileinfo
      updateCache(dbo,path,False)
      sys.stderr.write(' done\n')
  dbo.commit()
  #update hash
  updates = sorted(
    path
    for size in dbo.listupFileSize()
    for path in dbo.searchFileFromSize(size))
  for path in updates:
    sys.stderr.write('read ' + path + '...')
    if os.path.isfile(path):
      updateCache(dbo,path,True)
      sys.stderr.write(' done\n')
    else:
      dbo.deleteFileInfo(path)
      sys.stderr.write(' not found\n')
  dbo.commit()
  #linking
  # Both result sets are copied into lists first: updateCache() below
  # rewrites rows of the very table these queries read from.
  for digest,st_dev in list(dbo.searchFileDupHash()):
    center = dbo.searchFileFromHashOne(digest,st_dev)
    if center is None:
      # No usable row for this group (e.g. a NULL hash); nothing to link to.
      continue
    centerPath,c_ino = center
    if not os.path.isfile(centerPath):
      # The file chosen as the one to keep has vanished; leave the others alone.
      continue
    for tpath in list(dbo.searchDedupFiles(digest,st_dev,c_ino)):
      sys.stderr.write('link ' + tpath + '...')
      if releaseVer():
        _replaceWithHardLink(centerPath,tpath)
        updateCache(dbo,tpath,True)
      margecount += 1
      sys.stderr.write(' done\n')

  dbo.commit()
  dbo.close()

  sys.stderr.write('files  = ' + str(filecount) + '\n')
  sys.stderr.write('register = ' + str(_registercount) + '\n')
  sys.stderr.write('marges = ' + str(margecount) + '\n')


def main():
  """Command-line entry point: expects exactly one argument, the DB path.

  The list of files to process is read from stdin by hashmarge().  With a
  wrong number of arguments a usage line is printed to stderr and the
  process exits with a non-zero status instead of silently doing nothing.
  """
  if len(sys.argv) != 2:
    prog = sys.argv[0] if sys.argv else 'dedup.py'
    # sys.exit() with a string prints it to stderr and exits with status 1.
    sys.exit('usage: ' + prog + ' <dbpath>  (file list is read from stdin)')
  hashmarge(sys.argv[1])

if __name__ == '__main__':
  main()

使い方

ファイルリストはパイプ経由で渡してやり、
DBファイルのパスをパラメータで指定してやる形で利用する

$ find <ターゲットディレクトリ> -type f | python dedup.py <dbファイルのパス>

動作概要

受け取ったファイルリストのファイルサイズをdbに格納
db中の同一サイズのファイルを読み重複チェック
重複していたファイルをハードリンク

注意点

ファイル数が５万以下ならそこそこ使えますが、
ファイル数があまりにも多くなると
内容が異なるファイル同士でファイルサイズの被りが発生するせいで
処理が遅くなってしまいます。

Windows では os.stat() が st_dev / st_ino に有効な値を返さず、Python 2 では os.link() も使えないため、動作しませんでした