Initial import
diff --git a/bin/findsame_file_new.py b/bin/findsame_file_new.py
new file mode 100755
index 0000000..293b9fb
--- /dev/null
+++ b/bin/findsame_file_new.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python2.5
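+"""Find duplicate files (matched by size and sha1) under the given
+directories, and write a bash script that hard-links the copies together."""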
+
+MINFILE_SIZE = 1024          # ignore files smaller than 1 KiB
+FILEBUFFER_SIZE = 1024**2    # hash files in 1 MiB chunks
+APPLICATION_VERSION = '0.2'  # version tag for the on-disk cache format
+
+import os, sys
+
+import python24_adapter  # local module, presumably Python 2.4 compatibility shims
+from base64 import standard_b64encode as b64encode
+from collections import defaultdict
+import cPickle
+
+try:
+	import hashlib
+	def _getSha1():
+		return hashlib.sha1()
+except ImportError:
+	# hashlib is new in Python 2.5; fall back to the older sha module
+	import sha
+	def _getSha1():
+		return sha.new()
+
+def getSha1(filename):
+	if filename in _sha1_cache:
+		return b64encode(_sha1_cache[filename])
+
+	sha1 = _getSha1()
+	f = open(filename, 'rb')  # binary mode, so hashes are platform independent
+	try:
+		data = f.read(FILEBUFFER_SIZE)
+		while data:
+			sha1.update(data)
+			data = f.read(FILEBUFFER_SIZE)
+	finally:
+		f.close()
+
+	ret = sha1.digest()
+	_sha1_cache[filename] = ret
+	return b64encode(ret)
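+
+# _sha1_cache maps filename -> raw sha1 digest. It is created by loadCache()
+# below, so loadCache() must be called before the first use of getSha1().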
+
+try:
+	import psyco
+	psyco.full()  # optional JIT compiler; speeds up the hashing loop
+except ImportError:
+	print >>sys.stderr, "WARNING: Could not load psyco"
+
+def __versionUpgrade0_1(old_cache):
+	# The 0.1 cache stored base64 encoded hashes; 0.2 stores the raw digests
+	import base64
+	return '0.2', dict((filename, base64.b64decode(sha1hash)) for filename, sha1hash in old_cache)
+
+def loadCache(filename = os.path.expanduser('~/.sha1_cache'), version = APPLICATION_VERSION):
+	global _sha1_cache
+	try:
+		cache_version, cache = cPickle.load(open(filename, 'rb'))
+		if cache_version == '0.1':
+			cache_version, cache = __versionUpgrade0_1(cache)
+
+		if cache_version != version:
+			raise Exception("Invalid Version")
+		print 'WARNING: Using the cache file "%s", sha1 hashes may be old' % filename
+	except Exception:
+		# A missing, corrupt or incompatible cache simply means starting afresh
+		cache = {}
+	_sha1_cache = cache
+
+def storeCache(filename = os.path.expanduser('~/.sha1_cache'), version = APPLICATION_VERSION):
+	fd = open(filename, 'wb')
+	try:
+		cPickle.dump((version, _sha1_cache), fd)
+	finally:
+		fd.close()
+
+class GlobalFileInfo(object):
+	def __init__(self):
+		# files maps (sha1, size) -> {(device, inode): [paths]}, so that
+		# existing hard links are grouped under a single inode key
+		self.files = defaultdict(lambda : defaultdict(list))
+
+	def _scanDirUpdateFile(self, dirPath, filename):
+		def printPath(word):
+			# Show at most the last 80 characters of the path
+			print '%s "%s"' % (word, filename[-80:])
+		fullpath = os.path.abspath(os.path.join(dirPath, filename))
+		if os.path.islink(fullpath) or not os.path.isfile(fullpath):
+			printPath('Skipping')
+			return
+		try:
+			statInfo = os.stat(fullpath)
+
+			if statInfo.st_size < MINFILE_SIZE:
+				printPath('Skipping')
+				return
+			printPath('Scanning')
+
+			fileHash = getSha1(fullpath)
+			self.files[(fileHash, statInfo.st_size)][(statInfo.st_dev, statInfo.st_ino)].append(fullpath)
+		except (IOError, OSError):
+			print >>sys.stderr, 'WARNING: Could not get sha1 of "%s"' % fullpath
+
+	def scanDir(self, dirName):
+		for dirPath, dirs, files in os.walk(dirName):
+			print 'Scanning directory "%s"\n' % dirPath
+			# Prune version control directories from the walk
+			if '.svn' in dirs:
+				dirs.remove('.svn')
+
+			for f in files:
+				# Move the cursor up and clear the previous status line
+				sys.stdout.write("\033[A\033[300D\033[2K")
+				self._scanDirUpdateFile(dirPath, f)
+			sys.stdout.write("\033[A\033[100D\033[2K")
+
+	def findDuplicates(self):
+		# Only (hash, size) keys with more than one inode are true duplicates
+		return [(key, inodes) for key, inodes in self.files.items() if len(inodes) > 1]
+
+def prettyFormatDups(dups):
+	ret = []
+	for key, inodes in dups:
+		section = []
+		for inode_key, files in inodes.items():
+			section.append('%s: %s' % (inode_key, ', '.join(files)))
+		ret.append('%s\n\t%s' % (key, '\n\t'.join(section)))
+
+	return '\n'.join(ret)
+
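+# Example prettyFormatDups() output for one duplicate group (hypothetical values):
+#   ('c29tZWhhc2g=', 4096)
+#       (2049, 131075): /a/file1, /b/file1
+#       (2049, 131076): /c/file1
+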
+def makeBashScript(dups, fd):
+       spaceSaved = 0
+       print >>fd, "#!/bin/bash"
+       print >>fd, '# This script was created automatically by "%s"' % __file__
+       # Print out a helper function
+       print >>fd
+       print >>fd, 'function doLink() {'
+       print >>fd, '\tINPUT_FILE="${1}"'
+       print >>fd, '\tshift'
+       print >>fd, '\tfor file in "$@" ; do'
+       print >>fd, '\t\tln "${INPUT_FILE}" "${file}"'
+       print >>fd, '\tdone'
+       print >>fd, '}'
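+
+	# The generated script ends up looking like this (illustrative paths):
+	#   rm -f "/backup/b.iso" "/backup/c.iso"
+	#   doLink "/master/a.iso" "/backup/b.iso" "/backup/c.iso"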
+
+	for dup_key, inodes in dups:
+		print >>fd
+		print >>fd, '# Handling %s' % str(dup_key)
+		inodes_data = inodes.items()
+		# Keep the largest group of already-linked files as the link source
+		inodes_data.sort(key = lambda x: len(x[1]), reverse = True)
+		for inode_key, files in inodes_data[1:]:
+			print >>fd, '# Removing files connected to inode %d on device %d' % (inode_key[1], inode_key[0])
+			print >>fd, 'rm -f "%s"' % '" "'.join(files)
+		fileToLink = inodes_data[0][1][0] # The first filename of the largest group of (already) linked files
+		print >>fd, '# Now link all the files together'
+		print >>fd, 'doLink "%s" "%s"' % (fileToLink, '" "'.join('" "'.join(files) for inode_key, files in inodes_data[1:]))
+		spaceSaved += sum(len(files) for inode_key, files in inodes_data[1:]) * dup_key[1] # dup_key[1] is the file size
+
+	print >>fd
+	print >>fd, '# Total space saved: %d B (%d KB) (%d MB)' % (spaceSaved, spaceSaved / 1024, spaceSaved / 1024**2)
+
+def main():
+	if len(sys.argv) < 3:
+		print >>sys.stderr, 'Usage: %s <output_script.sh> <dir> [<dir> ...]' % sys.argv[0]
+		sys.exit(1)
+
+	loadCache()
+	files = GlobalFileInfo()
+
+	for dirName in sys.argv[2:]:
+		files.scanDir(dirName)
+
+	storeCache()
+	print "Done."
+	fd = open(sys.argv[1], 'wb')
+	try:
+		makeBashScript(files.findDuplicates(), fd)
+	finally:
+		fd.close()
+
+if __name__ == "__main__":
+       main()
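+
+# Example invocation (hypothetical paths):
+#   ./findsame_file_new.py dedupe.sh /srv/photos /backup/photos
+#   bash dedupe.sh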