--- /dev/null
+#!/usr/bin/env python2.5
+
+# Files whose size in bytes is below this threshold are skipped, not hashed.
+MINFILE_SIZE = 1024
+# Number of bytes read per chunk while hashing a file.
+FILEBUFFER_SIZE = 1024**2
+
+import os, sys, bisect
+
+from copy import copy
+from base64 import standard_b64encode as b64encode
+from collections import defaultdict
+import cPickle
+# Choose the SHA-1 constructor once at import time: hashlib when it can
+# be imported, otherwise the older sha module.
+try:
+    import hashlib
+except ImportError:
+    import sha
+    def _getSha1(filename):
+        # The filename argument is accepted but not used.
+        return sha.new()
+else:
+    def _getSha1(filename):
+        # The filename argument is accepted but not used.
+        return hashlib.sha1()
+def getSha1(filename):
+    """Return the base64-encoded SHA-1 digest of a file's contents.
+
+    The file is read in chunks of FILEBUFFER_SIZE bytes so that large
+    files are never held in memory in full.
+
+    Raises IOError if the file cannot be opened or read.
+    """
+    sha1 = _getSha1(filename)
+    # Binary mode: the digest must cover the raw bytes, and text mode
+    # may translate line endings on some platforms.
+    f = open(filename, 'rb')
+    try:
+        data = f.read(FILEBUFFER_SIZE)
+        while data:
+            sha1.update(data)
+            data = f.read(FILEBUFFER_SIZE)
+    finally:
+        # Close explicitly instead of relying on garbage collection; a
+        # scan may hash a very large number of files.
+        f.close()
+    return b64encode(sha1.digest())
+
+# psyco is optional: when it cannot be imported only a warning is
+# written to stderr and the script carries on.
+try:
+    import psyco
+    psyco.full()
+except ImportError:
+    sys.stderr.write("WARNING: Could not load psyco\n")
+
+class DiskObject(object):
+    """Base class for objects that are identified by their full path.
+
+    Subclasses must provide getFullPath().  Ordering, equality, hashing
+    and both string forms are derived from that path.
+    """
+    def __repr__(self):
+        return repr(self.getFullPath())
+    def __str__(self):
+        return self.getFullPath()
+    def __lt__(self, other):
+        # NotImplemented is a singleton to be returned, not an exception
+        # to be raised; returning it lets Python try the reflected
+        # operation on the other operand.
+        if not hasattr(other, 'getFullPath'):
+            return NotImplemented
+        return self.getFullPath() < other.getFullPath()
+    def __eq__(self, other):
+        if not hasattr(other, 'getFullPath'):
+            return NotImplemented
+        return self.getFullPath() == other.getFullPath()
+    def __ne__(self, other):
+        # Python 2 does not derive != from ==, so define it explicitly
+        # to keep the two consistent.
+        result = self.__eq__(other)
+        if result is NotImplemented:
+            return result
+        return not result
+    def __hash__(self):
+        # Must agree with __eq__: equal paths hash equally.
+        return hash(self.getFullPath())
+
+class Folder(DiskObject):
+    """A directory node in the in-memory folder tree.
+
+    Each folder stores its own name, its parent folder (None when it
+    has none) and a name -> child mapping filled through addChild().
+    """
+    def __init__(self, name, parent = None):
+        # A name that still contains a separator (and is not just the
+        # separator itself) is split: the leading part becomes an
+        # intermediate parent Folder, the last part this folder's name.
+        if name != os.path.sep and os.path.sep in name:
+            print name
+            parent_name, name = os.path.split(name)
+            parent = Folder(parent_name, parent)
+
+        self.name, self.parent = name, parent
+        self.children = {}
+        if parent:
+            parent.addChild(self)
+    def getFullPath(self):
+        # Walk up through the parents, building the name list from the
+        # top of the tree down to this folder.
+        names = []
+        node = self
+        while node:
+            names.insert(0, node.name)
+            node = node.parent
+        return os.path.sep.join(names)
+    def addChild(self, child):
+        # Children are keyed by name; an existing entry with the same
+        # name is replaced.
+        self.children[child.name] = child
+
+def findDirectory(rootDir, dirName, createNonExistant = False):
+    """Walk from rootDir down to the folder named by dirName.
+
+    dirName is split on os.path.sep and its first component is
+    discarded before the walk starts.  A missing folder is created when
+    createNonExistant is true; otherwise the KeyError from the failed
+    lookup is raised.
+    """
+    components = dirName.split(os.path.sep)[1:]
+    if components == ['']:
+        # A bare separator leaves one empty component: nothing to walk.
+        components = []
+
+    current = rootDir
+    for folderName in components:
+        try:
+            current = current.children[folderName]
+        except KeyError, e:
+            if createNonExistant:
+                current = Folder(folderName, current)
+            else:
+                raise e
+
+    return current
+
+class FileObject(DiskObject):
+    """A file entry inside a Folder.
+
+    The modification time and size are read from disk once, when the
+    object is constructed, and kept in mtime_size.
+    """
+    def __init__(self, name, folder):
+        self.name = name
+        self.folder = folder
+        # Snapshot (mtime, size) at construction time.
+        st = os.stat(self.getFullPath())
+        self.mtime_size = (st.st_mtime, st.st_size)
+    def getDiskID(self):
+        # The (device, inode) pair identifies the file on disk; it is
+        # looked up afresh on every call.
+        st = os.stat(self.getFullPath())
+        return (st.st_dev, st.st_ino)
+    def get_mtime_size(self):
+        return self.mtime_size
+    def getFullPath(self):
+        parts = { 'folder': self.folder.getFullPath(), 'file': self.name }
+        return '%(folder)s/%(file)s' % parts
+
+# Scan state kept between runs: the files that have been hashed, grouped
+# by digest, plus the folder tree they live in.
+class GlobalFileInfo(object):
+ def __init__(self):
+ # sha1 digest -> list of FileObject with that digest
+ self.files = defaultdict(list)
+ # FileObject -> the stored FileObject (looked up by full path via
+ # DiskObject's __hash__/__eq__)
+ self.filelist = {}
+ # Top of the folder tree; its name is the empty string.
+ self.root = Folder('')
+
+ # Hash a single file and record it, unless it is a symlink or not a
+ # regular file, is unchanged since it was last recorded, or is smaller
+ # than MINFILE_SIZE.  Prints a one-line status for the file.
+ def _scanDirUpdateFile(self, dirObject, dirPath, filename):
+ def printPath(word):
+ # Only the last 80 characters of the file name are shown.
+ print '%s "%s"' % (word, filename[-80:])
+ fullpath = os.path.join(dirPath, filename)
+ if os.path.islink(fullpath) or not os.path.isfile(fullpath):
+ printPath('Skipping')
+ return
+ try:
+ # NOTE(review): FileObject() calls os.stat, which presumably raises
+ # OSError on failure; only IOError is caught below - confirm.
+ file = FileObject(filename, dirObject)
+ # NOTE(review): new_mtime_size is assigned but never read.
+ new_mtime_size = file.get_mtime_size()
+
+ if file in self.filelist:
+ # Same (mtime, size) as the recorded entry: nothing to do.
+ if file.get_mtime_size() == self.filelist[file].get_mtime_size():
+ printPath('Skipping')
+ return
+ # The file changed: drop the stale entry before re-hashing.
+ old_sha1 = self.filelist[file].sha1
+ del self.filelist[file]
+ self.files[old_sha1].remove(file)
+
+ if file.get_mtime_size()[1] < MINFILE_SIZE:
+ printPath('Skipping')
+ return
+ printPath('Scanning')
+
+ file.sha1 = getSha1(fullpath)
+ self.files[file.sha1].append(file)
+ self.filelist[file] = file
+ except IOError:
+ print >>sys.stderr, 'WARNING: Could not get sha1 of "%s"\n' % (fullpath)
+
+ # Walk dirName with os.walk, mirroring each directory into the folder
+ # tree and passing every file to _scanDirUpdateFile.
+ def scanDir(self, dirName):
+ # NOTE(review): the returned folder is not used; the call still
+ # creates any missing folders along dirName.
+ root = findDirectory(self.root, dirName, createNonExistant = True)
+
+ for dirPath, dirs, files in os.walk(dirName):
+ print 'Scanning directory "%s"\n' % dirPath
+ folder = findDirectory(self.root, dirPath, createNonExistant = True)
+ # Add the children Directories
+ # Removing '.svn' from dirs in place also stops os.walk from
+ # descending into it.
+ if '.svn' in dirs:
+ dirs.remove('.svn')
+ for d in dirs:
+ Folder(d, folder) # As a side effect, this is added to the parent correctly
+
+ for f in files:
+ # ANSI escapes: cursor up one line, cursor left 300 columns,
+ # erase the line - so the next status overwrites the last one.
+ sys.stdout.write("\033[A\033[300D\033[2K")
+ self._scanDirUpdateFile(folder, dirPath, f)
+ sys.stdout.write("\033[A\033[100D\033[2K")
+ # Return a list of (sha1, [FileObject, ...]) pairs for every digest
+ # that has more than one file recorded.
+ def findDuplicates(self):
+ return [(sha1, list(filenames)) for sha1, filenames in self.files.items() if len(filenames) > 1]
+
+def main():
+    """Load the scan state, rescan the given directories and save it.
+
+    Usage: <script> STATEFILE [DIRECTORY ...]
+
+    STATEFILE is a pickled GlobalFileInfo.  If it cannot be read, a
+    fresh GlobalFileInfo is started.  Every DIRECTORY is scanned and the
+    result is pickled back to STATEFILE.
+    """
+    if len(sys.argv) < 2:
+        print >>sys.stderr, "Usage: %s STATEFILE [DIRECTORY ...]" % sys.argv[0]
+        sys.exit(2)
+    stateFile = sys.argv[1]
+
+    try:
+        # The state is written with a binary pickle protocol, so it
+        # must be read back in binary mode as well.
+        inFile = open(stateFile, 'rb')
+        try:
+            # NOTE: unpickling can execute arbitrary code; only use a
+            # state file that comes from a trusted source.
+            files = cPickle.load(inFile)
+        finally:
+            inFile.close()
+    except IOError:
+        files = GlobalFileInfo()
+
+    for dirName in sys.argv[2:]:
+        # The folder lookup discards the first path component, which is
+        # only correct for absolute paths.  abspath also removes any
+        # trailing separator.
+        files.scanDir(os.path.abspath(dirName))
+
+    outFile = open(stateFile, 'wb')
+    try:
+        cPickle.dump(files, outFile, 2)
+    finally:
+        # Close explicitly so the pickle is flushed to disk.
+        outFile.close()
+    print "Done"
+
+### print files.files
+
+# Run main() only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+ main()