]> code.delx.au - bg-scripts/blobdiff - bin/findsame_file.py
Removing a lot of unused libraries
[bg-scripts] / bin / findsame_file.py
diff --git a/bin/findsame_file.py b/bin/findsame_file.py
deleted file mode 100755 (executable)
index 1a5ce8d..0000000
+++ /dev/null
@@ -1,175 +0,0 @@
-#!/usr/bin/env python2.5
-
-MINFILE_SIZE = 1024
-FILEBUFFER_SIZE = 1024**2
-
-import os, sys, bisect
-
-from copy import copy
-from base64 import standard_b64encode as b64encode
-from collections import defaultdict
-import cPickle
-try:
-       import hashlib
-       def _getSha1(filename):
-               return hashlib.sha1()
-except ImportError:
-       import sha
-       def _getSha1(filename):
-               return sha.new()
-def getSha1(filename):
-       sha1 = _getSha1(filename)
-       f = file(filename, 'r')
-       data = f.read(FILEBUFFER_SIZE)
-       while data:
-               sha1.update(data)
-               data = f.read(FILEBUFFER_SIZE)
-       return b64encode(sha1.digest())
-
-try:
-       import psyco
-       psyco.full()
-except ImportError:
-       print >>sys.stderr, "WARNING: Could not load psyco"
-
-class DiskObject(object):
-       def __repr__(self):
-               return repr(self.getFullPath())
-       def __str__(self):
-               return self.getFullPath()
-       def __lt__(self, other):
-               if not hasattr(other, 'getFullPath'):
-                       raise NotImplemented()
-               return self.getFullPath() < other.getFullPath()
-       def __eq__(self, other):
-               if not hasattr(other, 'getFullPath'):
-                       raise NotImplemented()
-               return self.getFullPath() == other.getFullPath()
-       def __hash__(self):
-               return hash(self.getFullPath())
-
-class Folder(DiskObject):
-       def __init__(self, name, parent = None):
-               if name.find(os.path.sep) >= 0 and name != os.path.sep:
-                       print name
-                       parent_name, name = os.path.split(name)
-                       parent = Folder(parent_name, parent)
-
-               self.name = name
-               self.parent = parent
-               if parent:
-                       parent.addChild(self)
-               self.children = {}
-       def getFullPath(self):
-               folderStack = []
-               f = self
-               while f:
-                       folderStack.append(f.name)
-                       f = f.parent
-               return os.path.sep.join(reversed(folderStack))
-       def addChild(self, child):
-               self.children[child.name] = child
-
-def findDirectory(rootDir, dirName, createNonExistant = False):
-       dir = dirName.split(os.path.sep)[1:]
-       if dir == ['']:
-               dir = []
-       
-       ret = rootDir
-       for folderName in dir:
-               try:
-                       ret = ret.children[folderName]
-               except KeyError, e:
-                       if not createNonExistant:
-                               raise e
-                       ret = Folder(folderName, ret)
-
-       return ret
-
-class FileObject(DiskObject):
-       def __init__(self, name, folder):
-               self.name = name
-               self.folder = folder
-               statinfo = os.stat(self.getFullPath())
-               self.mtime_size = (statinfo.st_mtime, statinfo.st_size)
-       def getDiskID(self):
-               statinfo = os.stat(self.getFullPath())
-               return (statinfo.st_dev, statinfo.st_ino) # Identify the file
-       def get_mtime_size(self):
-               return self.mtime_size
-       def getFullPath(self):
-               return '%(folder)s/%(file)s' % { 'folder': self.folder.getFullPath(), 'file': self.name }
-
-class GlobalFileInfo(object):
-       def __init__(self):
-               self.files = defaultdict(list)
-               self.filelist = {}
-               self.root = Folder('')
-
-       def _scanDirUpdateFile(self, dirObject, dirPath, filename):
-               def printPath(word):
-                       print '%s "%s"' % (word, filename[-80:])
-               fullpath = os.path.join(dirPath, filename)
-               if os.path.islink(fullpath) or not os.path.isfile(fullpath):
-                       printPath('Skipping')
-                       return
-               try:
-                       file = FileObject(filename, dirObject)
-                       new_mtime_size = file.get_mtime_size()
-
-                       if file in self.filelist:
-                               if file.get_mtime_size() == self.filelist[file].get_mtime_size():
-                                       printPath('Skipping')
-                                       return
-                               old_sha1 = self.filelist[file].sha1
-                               del self.filelist[file]
-                               self.files[old_sha1].remove(file)
-
-                       if file.get_mtime_size()[1] < MINFILE_SIZE:
-                               printPath('Skipping')
-                               return
-                       printPath('Scanning')
-
-                       file.sha1 = getSha1(fullpath)
-                       self.files[file.sha1].append(file)
-                       self.filelist[file] = file
-               except IOError:
-                       print >>sys.stderr, 'WARNING: Could not get sha1 of "%s"\n' % (fullpath)
-
-       def scanDir(self, dirName):
-               root = findDirectory(self.root, dirName, createNonExistant = True)
-
-               for dirPath, dirs, files in os.walk(dirName):
-                       print 'Scanning directory "%s"\n' % dirPath
-                       folder = findDirectory(self.root, dirPath, createNonExistant = True)
-                       # Add the children Directories
-                       if '.svn' in dirs:
-                               dirs.remove('.svn')
-                       for d in dirs:
-                               Folder(d, folder) # As a side effect, this is added to the parent correctly
-
-                       for f in files:
-                               sys.stdout.write("\033[A\033[300D\033[2K")
-                               self._scanDirUpdateFile(folder, dirPath, f)
-                       sys.stdout.write("\033[A\033[100D\033[2K")
-       def findDuplicates(self):
-               return [(sha1, list(filenames)) for sha1, filenames in self.files.items() if len(filenames) > 1]
-
-def main():
-       try:
-               files = cPickle.load(open(sys.argv[1]))
-       except IOError:
-               files = GlobalFileInfo()
-
-       for dir in sys.argv[2:]:
-               if dir[-1] == '/':
-                       dir = dir[:-1]
-               files.scanDir(dir)
-       
-       cPickle.dump(files, open(sys.argv[1], 'wb'), 2)
-       print "Done"
-
-###    print files.files
-
-if __name__ == "__main__":
-       main()