+++ /dev/null
-#!/usr/bin/env python2.5
-
# Files smaller than this many bytes are skipped by the scanner.
MINFILE_SIZE = 1024
# Number of bytes read per chunk while hashing a file (1 MiB).
FILEBUFFER_SIZE = 1024 * 1024
# Version tag stored alongside the pickled hash cache.
APPLICATION_VERSION = '0.2'
-
-import os, sys, bisect
-
-import python24_adapter
-from copy import copy
-from base64 import standard_b64encode as b64encode
-from collections import defaultdict
-import cPickle
-
# Pick the SHA-1 constructor once: hashlib when it can be imported,
# otherwise the older ``sha`` module.
try:
    import hashlib
except ImportError:
    import sha
    _newSha1 = sha.new
else:
    _newSha1 = hashlib.sha1


def _getSha1(filename):
    """Return a new, empty SHA-1 hash object.

    *filename* is accepted for the caller's convenience but is not used.
    """
    return _newSha1()
def getSha1(filename):
    """Return the base64-encoded SHA-1 digest of the file at *filename*.

    Raw digests are memoised in the module-level ``_sha1_cache`` dict, keyed
    by *filename* exactly as passed in.  A cached entry is returned without
    touching the file, so the result may be stale if the file has changed
    since it was hashed.

    ``loadCache()`` must have been called first so that ``_sha1_cache``
    exists.  Errors from opening or reading the file propagate to the caller.
    """
    if filename in _sha1_cache:
        return b64encode(_sha1_cache[filename])

    sha1 = _getSha1(filename)
    # Binary mode: text mode may translate line endings and change the hash.
    f = open(filename, 'rb')
    try:
        # Read in fixed-size chunks so large files never sit in memory whole.
        data = f.read(FILEBUFFER_SIZE)
        while data:
            sha1.update(data)
            data = f.read(FILEBUFFER_SIZE)
    finally:
        # Always release the handle; a scan can touch a very large number of files.
        f.close()

    ret = sha1.digest()
    _sha1_cache[filename] = ret
    return b64encode(ret)
-
# psyco is optional: when it cannot be imported only a warning is emitted
# and the script carries on without it.
try:
    import psyco
    psyco.full()
except ImportError:
    sys.stderr.write("WARNING: Could not load psyco\n")
-
def __versionUpgrade0_1(input):
    """Convert the payload of a version 0.1 cache to the version 0.2 layout.

    Every digest in *input* is base64-decoded to its raw bytes.

    input -- either a mapping of filename -> base64 digest, or an iterable
             of (filename, base64 digest) pairs.

    Returns the tuple ('0.2', dict of filename -> raw digest).
    """
    import base64
    # Iterating a dict directly yields only its keys, which cannot be unpacked
    # into (filename, digest); ask for the items when the object has them.
    if hasattr(input, 'items'):
        pairs = input.items()
    else:
        pairs = input
    return '0.2', dict((filename, base64.b64decode(sha1hash)) for filename, sha1hash in pairs)
-
def loadCache(filename = os.path.expanduser('~/.sha1_cache'), version = APPLICATION_VERSION):
    """Initialise the module-level ``_sha1_cache`` from a pickled cache file.

    filename -- path of the cache file written by storeCache().
    version  -- cache version this program understands.

    A version 0.1 cache is upgraded in memory first.  The loaded cache is used
    only when its (possibly upgraded) version equals *version*; in that case a
    warning is printed to stdout because cached hashes may be out of date.
    Loading is best effort: a missing, unreadable, corrupt or wrong-version
    file leaves ``_sha1_cache`` as an empty dict and raises nothing.
    """
    global _sha1_cache
    cache = {}
    try:
        fd = open(filename, 'rb')
        try:
            # NOTE(security): unpickling can execute arbitrary code.  Only
            # point this at a cache file that this program wrote itself.
            cache_version, loaded = cPickle.load(fd)
        finally:
            fd.close()

        if cache_version == '0.1':
            cache_version, loaded = __versionUpgrade0_1(loaded)

        if cache_version == version:
            sys.stdout.write('WARNING: Using the cache file "%s", sha1 hash may be old\n' % filename)
            cache = loaded
    except Exception:
        # Deliberately broad, but no longer a bare except: Ctrl-C and
        # SystemExit are allowed to propagate.
        cache = {}
    _sha1_cache = cache
-
def storeCache(filename = os.path.expanduser('~/.sha1_cache'), version = APPLICATION_VERSION):
    """Pickle ``(version, _sha1_cache)`` into *filename*.

    The file is opened for binary writing, so previous content is replaced,
    and it is closed even when pickling fails.
    """
    out = open(filename, 'wb')
    try:
        payload = (version, _sha1_cache)
        cPickle.dump(payload, out)
    finally:
        out.close()
-
class GlobalFileInfo(object):
    """Index of scanned files, grouped by content and then by inode.

    ``files`` maps ``(base64 sha1, size in bytes)`` to a dict which in turn
    maps ``(st_dev, st_ino)`` to the list of absolute paths found for that
    inode.
    """

    def __init__(self):
        self.files = defaultdict(lambda: defaultdict(list))

    def _scanDirUpdateFile(self, dirPath, filename):
        """Hash one file and record it in ``self.files``.

        Symbolic links, anything that is not a regular file, and files
        smaller than MINFILE_SIZE are skipped.  One status line is written to
        stdout per call.  Files that cannot be stat'ed or read are reported on
        stderr and otherwise ignored.
        """
        def printPath(word):
            # Only the last 80 characters of the name are shown.
            sys.stdout.write('%s "%s"\n' % (word, filename[-80:]))

        fullpath = os.path.abspath(os.path.join(dirPath, filename))
        if os.path.islink(fullpath) or not os.path.isfile(fullpath):
            printPath('Skipping')
            return
        try:
            statInfo = os.stat(fullpath)

            if statInfo.st_size < MINFILE_SIZE:
                printPath('Skipping')
                return
            printPath('Scanning')

            fileHash = getSha1(fullpath)
            self.files[(fileHash, statInfo.st_size)][(statInfo.st_dev, statInfo.st_ino)].append(fullpath)
        except (IOError, OSError):
            # os.stat() signals failure with OSError (for example when the
            # file disappears after the isfile() check); reading the file
            # signals it with IOError.  Neither should abort the scan.
            sys.stderr.write('WARNING: Could not get sha1 of "%s"\n\n' % (fullpath))

    def scanDir(self, dirName):
        """Walk *dirName* recursively and record every file below it."""
        for dirPath, dirs, files in os.walk(dirName):
            sys.stdout.write('Scanning directory "%s"\n\n' % dirPath)
            # Pruning the list in place stops os.walk from descending into .svn
            if '.svn' in dirs:
                dirs.remove('.svn')

            for f in files:
                # ANSI escapes: cursor up one line, move left, erase the line,
                # so each status line overwrites the previous one.
                sys.stdout.write("\033[A\033[300D\033[2K")
                self._scanDirUpdateFile(dirPath, f)
            # Erase the last status line before the next directory is announced.
            sys.stdout.write("\033[A\033[100D\033[2K")

    def findDuplicates(self):
        """Return ``[(key, inodes), ...]`` for content stored under more than one inode."""
        return [(key, inodes) for key, inodes in self.files.items() if len(inodes) > 1]
-
def prettyFormatDups(dups):
    """Render duplicate groups as human-readable text.

    dups -- iterable of (key, inodes) pairs, where inodes maps an inode key
            to the list of file names sharing that inode.

    Each group becomes its key on one line followed by one tab-indented
    ``inode_key: file, file, ...`` line per inode.  Groups are joined with
    newlines; an empty *dups* gives an empty string.
    """
    ret = []
    for key, inodes in dups:
        section = ['%s: %s' % (inode_key, ', '.join(files)) for inode_key, files in inodes.items()]
        ret.append('%s\n\t%s' % (key, '\n\t'.join(section)))

    return '\n'.join(ret)
-
-
def _shellQuote(text):
    """Return *text* as one single-quoted bash word, literal for every character."""
    # Inside single quotes bash treats everything literally; a quote itself
    # is written by closing the string, adding \' and reopening it.
    return "'" + text.replace("'", "'\\''") + "'"


def makeBashScript(dups, fd):
    """Write to *fd* a bash script that replaces duplicate files with hard links.

    dups -- iterable of ((hash, size), inodes) pairs as produced by
            GlobalFileInfo.findDuplicates(); inodes maps (st_dev, st_ino) to
            the list of paths that share that inode.
    fd   -- object with a write() method that receives the script text.

    Hard links cannot span devices, so duplicates are handled per device.
    On each device the inode with the most paths is kept and the paths of
    every other inode are removed and re-created as links to it.  A device
    that holds only one inode of a group is left untouched.  The script ends
    with a comment estimating the space saved: one file size per replaced
    inode.
    """
    write = fd.write
    spaceSaved = 0
    write("#!/bin/bash\n")
    write('# This script was created automatically by "%s"\n' % __file__)
    # Print out a helper function
    write('\n')
    write('function doLink() {\n')
    write('\tINPUT_FILE="${1}"\n')
    write('\tshift\n')
    write('\tfor file in "$@" ; do\n')
    write('\t\tln "${INPUT_FILE}" "${file}"\n')
    write('\tdone\n')
    write('}\n')

    for dup_key, inodes in dups:
        write('\n')
        write('# Handling %s\n' % str(dup_key))

        # Group the inodes by device; linking is only possible within one.
        by_device = {}
        for inode_key, files in inodes.items():
            by_device.setdefault(inode_key[0], []).append((inode_key, files))

        for device in sorted(by_device):
            # Largest group of already-linked paths first: it is the one kept.
            groups = sorted(by_device[device], key = lambda x: len(x[1]), reverse = True)
            if len(groups) < 2:
                write('# Nothing to link on device %d (hard links cannot cross devices)\n' % device)
                continue

            fileToLink = groups[0][1][0]
            replaced = groups[1:]
            targets = []
            for inode_key, files in replaced:
                write('# Removing files connected to inode %d on device %d\n' % (inode_key[1], inode_key[0]))
                write('rm -f %s\n' % ' '.join(_shellQuote(name) for name in files))
                targets.extend(files)
            write('# Now link all the files together\n')
            write('doLink %s %s\n' % (_shellQuote(fileToLink), ' '.join(_shellQuote(name) for name in targets)))
            # Paths sharing an inode occupy its space once, so each replaced
            # inode frees one file size however many paths it had.
            spaceSaved += len(replaced) * dup_key[1]

    write('\n')
    write('# Total space saved: %d B (%dK B) (%d MB)\n' % (spaceSaved, spaceSaved // 1024, spaceSaved // 1024**2))
-
def main():
    """Scan the directories named on the command line and write the script.

    Command line: OUTPUT_SCRIPT [DIRECTORY ...]

    The hash cache is loaded before the scan and stored after it; the
    de-duplication script for everything found is then written to
    OUTPUT_SCRIPT.  Exits with status 2 when OUTPUT_SCRIPT is missing.
    """
    if len(sys.argv) < 2:
        # Fail before any scanning instead of crashing once the work is done.
        sys.stderr.write('Usage: %s OUTPUT_SCRIPT [DIRECTORY ...]\n' % sys.argv[0])
        sys.exit(2)

    loadCache()
    files = GlobalFileInfo()

    for dirName in sys.argv[2:]:
        files.scanDir(dirName)

    storeCache()
    sys.stdout.write("Done.\n")
    # Open outside the try block: if open() fails there is nothing to close,
    # and the original error must not be masked by a NameError on ``fd``.
    fd = open(sys.argv[1], 'wb')
    try:
        makeBashScript(files.findDuplicates(), fd)
    finally:
        fd.close()


if __name__ == "__main__":
    main()