+++ /dev/null
-#!/usr/bin/env python2.5
-
# Files smaller than this many bytes are skipped instead of being hashed.
MINFILE_SIZE = 1 << 10
# Number of bytes requested per read() call while hashing a file.
FILEBUFFER_SIZE = 1 << 20
-
-import os, sys, bisect
-
-from copy import copy
-from base64 import standard_b64encode as b64encode
-from collections import defaultdict
-import cPickle
try:
    import hashlib
    def _getSha1(filename):
        """Return a fresh, empty SHA-1 hash object.

        ``filename`` is accepted for interface compatibility but is not used.
        """
        return hashlib.sha1()
except ImportError:
    # No hashlib available: fall back to the legacy ``sha`` module.
    import sha
    def _getSha1(filename):
        """Return a fresh, empty SHA-1 hash object (legacy ``sha`` module)."""
        return sha.new()

def getSha1(filename, bufferSize = None):
    """Return the base64-encoded SHA-1 digest of a file's contents.

    The file is read in chunks so that large files never have to fit in
    memory at once.

    Parameters:
        filename:   path of the file to hash.
        bufferSize: number of bytes to read per chunk; defaults to the
                    module-level FILEBUFFER_SIZE when omitted.

    Returns:
        The digest, encoded with the standard base64 alphabet.

    Raises:
        IOError if the file cannot be opened or read.
    """
    if bufferSize is None:
        bufferSize = FILEBUFFER_SIZE

    sha1 = _getSha1(filename)
    # Binary mode: the digest must cover the raw bytes, with no newline
    # translation applied by the platform.
    f = open(filename, 'rb')
    try:
        data = f.read(bufferSize)
        while data:
            sha1.update(data)
            data = f.read(bufferSize)
    finally:
        # Always release the handle; a scan hashes a very large number of files.
        f.close()
    return b64encode(sha1.digest())
-
# Psyco is optional: switch it on when it can be imported, otherwise warn
# on stderr and carry on without it.
try:
    import psyco
    psyco.full()
except ImportError:
    sys.stderr.write("WARNING: Could not load psyco\n")
-
class DiskObject(object):
    """Base class for objects that are identified by their full path.

    Subclasses must provide ``getFullPath()``; representation, ordering,
    equality and hashing are all derived from the string it returns.
    """
    def __repr__(self):
        return repr(self.getFullPath())
    def __str__(self):
        return self.getFullPath()
    def __lt__(self, other):
        # NotImplemented is a value to be returned, not an exception to be
        # raised: returning it lets Python try the reflected operation.
        if not hasattr(other, 'getFullPath'):
            return NotImplemented
        return self.getFullPath() < other.getFullPath()
    def __eq__(self, other):
        if not hasattr(other, 'getFullPath'):
            return NotImplemented
        return self.getFullPath() == other.getFullPath()
    def __ne__(self, other):
        # Python 2 does not derive != from ==, so keep the two consistent here.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
    def __hash__(self):
        # Must agree with __eq__: equal paths hash equally.
        return hash(self.getFullPath())
-
-class Folder(DiskObject):
- def __init__(self, name, parent = None):
- if name.find(os.path.sep) >= 0 and name != os.path.sep:
- print name
- parent_name, name = os.path.split(name)
- parent = Folder(parent_name, parent)
-
- self.name = name
- self.parent = parent
- if parent:
- parent.addChild(self)
- self.children = {}
- def getFullPath(self):
- folderStack = []
- f = self
- while f:
- folderStack.append(f.name)
- f = f.parent
- return os.path.sep.join(reversed(folderStack))
- def addChild(self, child):
- self.children[child.name] = child
-
def findDirectory(rootDir, dirName, createNonExistant = False):
    """Return the tree node below ``rootDir`` that corresponds to ``dirName``.

    Parameters:
        rootDir:           node representing the filesystem root; every node
                           must expose a ``children`` dict keyed by name.
        dirName:           directory path. Relative paths are resolved against
                           the current working directory; an empty string
                           means the root itself.
        createNonExistant: when true, missing nodes along the way are created
                           as Folder objects instead of raising.

    Returns:
        The node for the last path component (``rootDir`` for the root path).

    Raises:
        KeyError if a component is missing and ``createNonExistant`` is false.
    """
    if not dirName:
        return rootDir

    # abspath anchors relative names and collapses '.', '..' and redundant
    # separators; the drive prefix (if the platform has one) is not part of
    # the tree, so it is dropped.
    drive, tail = os.path.splitdrive(os.path.abspath(dirName))
    # Filtering empty strings removes the leading '' produced by the root
    # separator as well as anything left by a doubled leading separator.
    components = [part for part in tail.split(os.path.sep) if part]

    node = rootDir
    for folderName in components:
        try:
            node = node.children[folderName]
        except KeyError:
            if not createNonExistant:
                raise  # bare raise keeps the original traceback
            node = Folder(folderName, node)

    return node
-
class FileObject(DiskObject):
    """A regular file inside a Folder, with the (mtime, size) pair captured at creation."""
    def __init__(self, name, folder):
        """Remember the name and containing folder, then stat the file once.

        Raises whatever ``os.stat`` raises when the path cannot be examined.
        """
        self.name = name
        self.folder = folder
        st = os.stat(self.getFullPath())
        self.mtime_size = (st.st_mtime, st.st_size)
    def getDiskID(self):
        """Stat the file again and return its (device, inode) pair."""
        st = os.stat(self.getFullPath())
        return (st.st_dev, st.st_ino)
    def get_mtime_size(self):
        """Return the (mtime, size) tuple recorded by the constructor."""
        return self.mtime_size
    def getFullPath(self):
        """Return the folder's full path and the file name joined by '/'."""
        parts = { 'folder': self.folder.getFullPath(), 'file': self.name }
        return '%(folder)s/%(file)s' % parts
-
class GlobalFileInfo(object):
    """Index of every scanned file, grouped by content hash.

    Attributes:
        files:    maps a sha1 digest to the list of FileObjects with that digest.
        filelist: maps each FileObject to itself, so the stored record for a
                  path can be retrieved from a freshly built, equal object.
        root:     root Folder of the in-memory directory tree.
    """
    def __init__(self):
        self.files = defaultdict(list)
        self.filelist = {}
        self.root = Folder('')

    def _scanDirUpdateFile(self, dirObject, dirPath, filename):
        """Bring the index up to date for a single file.

        Symlinks, non-regular files, files whose (mtime, size) is unchanged
        and files smaller than MINFILE_SIZE are skipped; anything else is
        (re)hashed and recorded. A file that cannot be examined or read is
        reported on stderr and left out of the index.

        Parameters:
            dirObject: Folder node that contains the file.
            dirPath:   path of that directory as used for disk access.
            filename:  name of the file inside the directory.
        """
        def printPath(word):
            # Only the last 80 characters of the name are shown.
            sys.stdout.write('%s "%s"\n' % (word, filename[-80:]))
        fullpath = os.path.join(dirPath, filename)
        if os.path.islink(fullpath) or not os.path.isfile(fullpath):
            printPath('Skipping')
            return
        try:
            entry = FileObject(filename, dirObject)

            known = self.filelist.get(entry)
            if known is not None:
                if entry.get_mtime_size() == known.get_mtime_size():
                    printPath('Skipping')
                    return
                # The file changed since the last scan: forget the stale record.
                del self.filelist[entry]
                self.files[known.sha1].remove(known)

            if entry.get_mtime_size()[1] < MINFILE_SIZE:
                printPath('Skipping')
                return
            printPath('Scanning')

            entry.sha1 = getSha1(fullpath)
            self.files[entry.sha1].append(entry)
            self.filelist[entry] = entry
        except (IOError, OSError):
            # os.stat reports failures as OSError, open/read as IOError; either
            # way one unreadable file must not abort the whole scan.
            sys.stderr.write('WARNING: Could not get sha1 of "%s"\n\n' % (fullpath))

    def scanDir(self, dirName):
        """Walk ``dirName`` recursively and update the index for every file found.

        Directories named '.svn' are not descended into. Progress is written
        to stdout, with the per-file status line redrawn in place.
        """
        # Make sure the starting directory exists in the tree.
        findDirectory(self.root, dirName, createNonExistant = True)

        for dirPath, dirs, files in os.walk(dirName):
            # The trailing blank line is the slot the per-file status is drawn in.
            sys.stdout.write('Scanning directory "%s"\n\n' % dirPath)
            folder = findDirectory(self.root, dirPath, createNonExistant = True)
            # Pruning the list in place stops os.walk from descending into it.
            if '.svn' in dirs:
                dirs.remove('.svn')
            for d in dirs:
                # Keep nodes that already exist (e.g. loaded from a previous
                # run) instead of replacing them with empty ones.
                if d not in folder.children:
                    Folder(d, folder) # Registers itself with the parent

            for f in files:
                # Cursor up one line, back to column 0, erase the line.
                sys.stdout.write("\033[A\033[300D\033[2K")
                self._scanDirUpdateFile(folder, dirPath, f)
            sys.stdout.write("\033[A\033[100D\033[2K")
    def findDuplicates(self):
        """Return a list of (sha1, [files]) pairs for every digest shared by more than one file."""
        return [(sha1, list(filenames)) for sha1, filenames in self.files.items() if len(filenames) > 1]
-
def main():
    """Command-line entry point: STATE_FILE [DIRECTORY ...].

    Loads the index pickled in STATE_FILE (starting with an empty index when
    the file cannot be opened), scans every DIRECTORY given, and writes the
    updated index back to STATE_FILE.
    """
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: %s STATE_FILE [DIRECTORY ...]\n' % sys.argv[0])
        sys.exit(2)
    stateFile = sys.argv[1]

    try:
        # Binary mode: the state is written below with pickle protocol 2.
        stateIn = open(stateFile, 'rb')
        try:
            # NOTE(security): unpickling can execute arbitrary code; only use
            # state files this program wrote itself.
            files = cPickle.load(stateIn)
        finally:
            stateIn.close()
    except IOError:
        files = GlobalFileInfo()

    for dirName in sys.argv[2:]:
        # Drop one trailing slash, but leave a bare "/" (the root) alone and
        # do not index into an empty argument.
        if len(dirName) > 1 and dirName[-1] == '/':
            dirName = dirName[:-1]
        files.scanDir(dirName)

    stateOut = open(stateFile, 'wb')
    try:
        cPickle.dump(files, stateOut, 2)
    finally:
        # Close explicitly so the state is flushed to disk before "Done".
        stateOut.close()
    sys.stdout.write("Done\n")
-
-### print files.files
-
# Run the scanner only when this file is executed as a script.
if "__main__" == __name__:
    main()