]> code.delx.au - bg-scripts/blobdiff - bin/findsame_file.py
Initial import
[bg-scripts] / bin / findsame_file.py
diff --git a/bin/findsame_file.py b/bin/findsame_file.py
new file mode 100755 (executable)
index 0000000..1a5ce8d
--- /dev/null
@@ -0,0 +1,175 @@
+#!/usr/bin/env python2.5
+
+MINFILE_SIZE = 1024
+FILEBUFFER_SIZE = 1024**2
+
+import os, sys, bisect
+
+from copy import copy
+from base64 import standard_b64encode as b64encode
+from collections import defaultdict
+import cPickle
+try:
+       import hashlib
+       def _getSha1(filename):
+               return hashlib.sha1()
+except ImportError:
+       import sha
+       def _getSha1(filename):
+               return sha.new()
+def getSha1(filename):
+       sha1 = _getSha1(filename)
+       f = file(filename, 'r')
+       data = f.read(FILEBUFFER_SIZE)
+       while data:
+               sha1.update(data)
+               data = f.read(FILEBUFFER_SIZE)
+       return b64encode(sha1.digest())
+
+# psyco is optional: if it cannot be imported the script only prints a
+# warning to stderr and carries on.
+try:
+       import psyco
+       psyco.full()
+except ImportError:
+       print >>sys.stderr, "WARNING: Could not load psyco"
+
+class DiskObject(object):
+       def __repr__(self):
+               return repr(self.getFullPath())
+       def __str__(self):
+               return self.getFullPath()
+       def __lt__(self, other):
+               if not hasattr(other, 'getFullPath'):
+                       raise NotImplemented()
+               return self.getFullPath() < other.getFullPath()
+       def __eq__(self, other):
+               if not hasattr(other, 'getFullPath'):
+                       raise NotImplemented()
+               return self.getFullPath() == other.getFullPath()
+       def __hash__(self):
+               return hash(self.getFullPath())
+
+class Folder(DiskObject):
+       def __init__(self, name, parent = None):
+               if name.find(os.path.sep) >= 0 and name != os.path.sep:
+                       print name
+                       parent_name, name = os.path.split(name)
+                       parent = Folder(parent_name, parent)
+
+               self.name = name
+               self.parent = parent
+               if parent:
+                       parent.addChild(self)
+               self.children = {}
+       def getFullPath(self):
+               folderStack = []
+               f = self
+               while f:
+                       folderStack.append(f.name)
+                       f = f.parent
+               return os.path.sep.join(reversed(folderStack))
+       def addChild(self, child):
+               self.children[child.name] = child
+
+def findDirectory(rootDir, dirName, createNonExistant = False):
+       dir = dirName.split(os.path.sep)[1:]
+       if dir == ['']:
+               dir = []
+       
+       ret = rootDir
+       for folderName in dir:
+               try:
+                       ret = ret.children[folderName]
+               except KeyError, e:
+                       if not createNonExistant:
+                               raise e
+                       ret = Folder(folderName, ret)
+
+       return ret
+
+class FileObject(DiskObject):
+       def __init__(self, name, folder):
+               self.name = name
+               self.folder = folder
+               statinfo = os.stat(self.getFullPath())
+               self.mtime_size = (statinfo.st_mtime, statinfo.st_size)
+       def getDiskID(self):
+               statinfo = os.stat(self.getFullPath())
+               return (statinfo.st_dev, statinfo.st_ino) # Identify the file
+       def get_mtime_size(self):
+               return self.mtime_size
+       def getFullPath(self):
+               return '%(folder)s/%(file)s' % { 'folder': self.folder.getFullPath(), 'file': self.name }
+
+class GlobalFileInfo(object):
+       def __init__(self):
+               self.files = defaultdict(list)
+               self.filelist = {}
+               self.root = Folder('')
+
+       def _scanDirUpdateFile(self, dirObject, dirPath, filename):
+               def printPath(word):
+                       print '%s "%s"' % (word, filename[-80:])
+               fullpath = os.path.join(dirPath, filename)
+               if os.path.islink(fullpath) or not os.path.isfile(fullpath):
+                       printPath('Skipping')
+                       return
+               try:
+                       file = FileObject(filename, dirObject)
+                       new_mtime_size = file.get_mtime_size()
+
+                       if file in self.filelist:
+                               if file.get_mtime_size() == self.filelist[file].get_mtime_size():
+                                       printPath('Skipping')
+                                       return
+                               old_sha1 = self.filelist[file].sha1
+                               del self.filelist[file]
+                               self.files[old_sha1].remove(file)
+
+                       if file.get_mtime_size()[1] < MINFILE_SIZE:
+                               printPath('Skipping')
+                               return
+                       printPath('Scanning')
+
+                       file.sha1 = getSha1(fullpath)
+                       self.files[file.sha1].append(file)
+                       self.filelist[file] = file
+               except IOError:
+                       print >>sys.stderr, 'WARNING: Could not get sha1 of "%s"\n' % (fullpath)
+
+       def scanDir(self, dirName):
+               root = findDirectory(self.root, dirName, createNonExistant = True)
+
+               for dirPath, dirs, files in os.walk(dirName):
+                       print 'Scanning directory "%s"\n' % dirPath
+                       folder = findDirectory(self.root, dirPath, createNonExistant = True)
+                       # Add the children Directories
+                       if '.svn' in dirs:
+                               dirs.remove('.svn')
+                       for d in dirs:
+                               Folder(d, folder) # As a side effect, this is added to the parent correctly
+
+                       for f in files:
+                               sys.stdout.write("\033[A\033[300D\033[2K")
+                               self._scanDirUpdateFile(folder, dirPath, f)
+                       sys.stdout.write("\033[A\033[100D\033[2K")
+       def findDuplicates(self):
+               return [(sha1, list(filenames)) for sha1, filenames in self.files.items() if len(filenames) > 1]
+
+def main():
+       try:
+               files = cPickle.load(open(sys.argv[1]))
+       except IOError:
+               files = GlobalFileInfo()
+
+       for dir in sys.argv[2:]:
+               if dir[-1] == '/':
+                       dir = dir[:-1]
+               files.scanDir(dir)
+       
+       cPickle.dump(files, open(sys.argv[1], 'wb'), 2)
+       print "Done"
+
+###    print files.files
+
+if __name__ == "__main__":
+       main()