--- /dev/null
+#!/usr/bin/env python2.5
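+# Scan one or more directory trees for files with identical content (matched by
+# size and SHA1) and write a bash script that replaces the copies with hard links.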
+
+MINFILE_SIZE = 1024
+FILEBUFFER_SIZE = 1024**2
+APPLICATION_VERSION = '0.2'
+
+import os, sys
+
+import python24_adapter
+from base64 import standard_b64encode as b64encode
+from collections import defaultdict
+import cPickle
+
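+# hashlib is only available from Python 2.5 onwards; fall back to the
+# deprecated sha module on older interpreters.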
+try:
+ import hashlib
+ def _getSha1(filename):
+ return hashlib.sha1()
+except ImportError:
+ import sha
+ def _getSha1(filename):
+ return sha.new()
+
+def getSha1(filename):
+    """Return the base64-encoded SHA1 digest of a file, caching raw digests by filename."""
+    if filename in _sha1_cache:
+        return b64encode(_sha1_cache[filename])
+
+    sha1 = _getSha1(filename)
+    f = open(filename, 'rb')
+    try:
+        data = f.read(FILEBUFFER_SIZE)
+        while data:
+            sha1.update(data)
+            data = f.read(FILEBUFFER_SIZE)
+    finally:
+        f.close()
+
+    ret = sha1.digest()
+    _sha1_cache[filename] = ret
+    return b64encode(ret)
+
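+# psyco is an optional JIT compiler for 32-bit Python 2; the script works without it.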
+try:
+ import psyco
+ psyco.full()
+except ImportError:
+ print >>sys.stderr, "WARNING: Could not load psyco"
+
+def __versionUpgrade0_1(old_cache):
+    """Convert a version 0.1 cache (base64-encoded digests) to the 0.2 format (raw digests)."""
+    import base64
+    return '0.2', dict((filename, base64.b64decode(sha1hash)) for filename, sha1hash in old_cache)
+
+def loadCache(filename = os.path.expanduser('~/.sha1_cache'), version = APPLICATION_VERSION):
+    """Load the persistent sha1 cache, upgrading old formats; start empty on any failure."""
+    global _sha1_cache
+    try:
+        cache_version, cache = cPickle.load(open(filename, 'rb'))
+        if cache_version == '0.1':
+            cache_version, cache = __versionUpgrade0_1(cache)
+
+        if cache_version != version:
+            raise ValueError("Invalid cache version: %s" % cache_version)
+        print 'WARNING: Using cache file "%s"; stored sha1 hashes may be stale' % filename
+    except Exception:
+        cache = {}
+    _sha1_cache = cache
+
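+# Persist the in-memory sha1 cache so later runs can avoid re-hashing files.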
+def storeCache(filename = os.path.expanduser('~/.sha1_cache'), version = APPLICATION_VERSION):
+ fd = open(filename, 'wb')
+ try:
+ cPickle.dump((version, _sha1_cache), fd)
+ finally:
+ fd.close()
+
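+# Collects scan results keyed by content: files maps (sha1, size) -> {(st_dev, st_ino): [paths]}.
+# A key with more than one inode entry means identical files that are not yet hard-linked.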
+class GlobalFileInfo(object):
+ def __init__(self):
+ self.files = defaultdict(lambda : defaultdict(list))
+
+    def _scanDirUpdateFile(self, dirPath, filename):
+        def printPath(word):
+            print '%s "%s"' % (word, filename[-80:])
+        fullpath = os.path.abspath(os.path.join(dirPath, filename))
+        # Only hash regular, non-symlinked files above the minimum size
+        if os.path.islink(fullpath) or not os.path.isfile(fullpath):
+            printPath('Skipping')
+            return
+        try:
+            statInfo = os.stat(fullpath)
+
+            if statInfo.st_size < MINFILE_SIZE:
+                printPath('Skipping')
+                return
+            printPath('Scanning')
+
+            fileHash = getSha1(fullpath)
+            self.files[(fileHash, statInfo.st_size)][(statInfo.st_dev, statInfo.st_ino)].append(fullpath)
+        except (IOError, OSError):
+            print >>sys.stderr, 'WARNING: Could not get sha1 of "%s"' % fullpath
+
+    def scanDir(self, dirName):
+        for dirPath, dirs, files in os.walk(dirName):
+            print 'Scanning directory "%s"\n' % dirPath
+            # Do not descend into version-control metadata
+            if '.svn' in dirs:
+                dirs.remove('.svn')
+
+            for f in files:
+                # Move the cursor up and clear the line so per-file progress overwrites itself
+                sys.stdout.write("\033[A\033[300D\033[2K")
+                self._scanDirUpdateFile(dirPath, f)
+            sys.stdout.write("\033[A\033[100D\033[2K")
+
+ def findDuplicates(self):
+ return [(key, inodes) for key, inodes in self.files.items() if len(inodes) > 1]
+
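+# Render the duplicate map as an indented, human-readable listing.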
+def prettyFormatDups(dups):
+    ret = []
+    for key, inodes in dups:
+        section = []
+        for inode_key, files in inodes.items():
+            section.append('%s: %s' % (inode_key, ', '.join(files)))
+        ret.append('%s\n\t%s' % (key, '\n\t'.join(section)))
+
+    return '\n'.join(ret)
+
+
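+# Emit a bash script that removes redundant copies and re-creates them as hard
+# links to a single surviving file, reporting the total space reclaimed.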
+def makeBashScript(dups, fd):
+ spaceSaved = 0
+ print >>fd, "#!/bin/bash"
+ print >>fd, '# This script was created automatically by "%s"' % __file__
+ # Print out a helper function
+ print >>fd
+ print >>fd, 'function doLink() {'
+ print >>fd, '\tINPUT_FILE="${1}"'
+ print >>fd, '\tshift'
+ print >>fd, '\tfor file in "$@" ; do'
+ print >>fd, '\t\tln "${INPUT_FILE}" "${file}"'
+ print >>fd, '\tdone'
+ print >>fd, '}'
+
+    for dup_key, inodes in dups:
+        print >>fd
+        print >>fd, '# Handling %s' % str(dup_key)
+        # Keep the largest group of already-linked files and relink the rest to it
+        inodes_data = inodes.items()
+        inodes_data.sort(key = lambda x: len(x[1]), reverse = True)
+        for inode_key, files in inodes_data[1:]:
+            print >>fd, '# Removing files connected to inode %d on device %d' % (inode_key[1], inode_key[0])
+            print >>fd, 'rm -f "%s"' % '" "'.join(files)
+        fileToLink = inodes_data[0][1][0] # First filename of the largest group of (already) linked files
+        print >>fd, '# Now link all the files together'
+        print >>fd, 'doLink "%s" "%s"' % (fileToLink, '" "'.join('" "'.join(files) for inode_key, files in inodes_data[1:]))
+        spaceSaved += sum(len(files) for inode_key, files in inodes_data[1:]) * dup_key[1]
+
+ print >>fd
+    print >>fd, '# Total space saved: %d B (%d KB) (%d MB)' % (spaceSaved, spaceSaved / 1024, spaceSaved / 1024**2)
+
+def main():
+    # sys.argv[1] is the bash script to write; the remaining arguments are directories to scan
+    loadCache()
+    files = GlobalFileInfo()
+
+    for dirName in sys.argv[2:]:
+        files.scanDir(dirName)
+
+    storeCache()
+    print "Done."
+    # Open the output script outside the try block so fd is always defined in the finally clause
+    fd = open(sys.argv[1], 'wb')
+    try:
+        makeBashScript(files.findDuplicates(), fd)
+    finally:
+        fd.close()
+
+if __name__ == "__main__":
+ main()