#!/usr/bin/env python2.5
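"""Find duplicate files and emit a bash script that hard-links them together.

Usage: findsame_file_new.py OUTPUT_SCRIPT DIR [DIR ...]

Files smaller than MINFILE_SIZE are ignored. SHA-1 digests are cached in
~/.sha1_cache between runs.
"""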

MINFILE_SIZE = 1024
FILEBUFFER_SIZE = 1024**2
APPLICATION_VERSION = '0.2'

import os, sys

import python24_adapter
from base64 import standard_b64encode as b64encode
from collections import defaultdict
import cPickle

try:
    import hashlib
    def _newSha1():
        return hashlib.sha1()
except ImportError:
    # Python < 2.5 has no hashlib; fall back to the deprecated sha module
    import sha
    def _newSha1():
        return sha.new()

def getSha1(filename):
    # Return the cached digest if this file has been hashed before
    if filename in _sha1_cache:
        return b64encode(_sha1_cache[filename])

    sha1 = _newSha1()
    f = open(filename, 'rb')
    try:
        data = f.read(FILEBUFFER_SIZE)
        while data:
            sha1.update(data)
            data = f.read(FILEBUFFER_SIZE)
    finally:
        f.close()

    ret = sha1.digest()
    _sha1_cache[filename] = ret
    return b64encode(ret)

# psyco (a Python 2 JIT) is optional; warn if it is unavailable
try:
    import psyco
    psyco.full()
except ImportError:
    print >>sys.stderr, "WARNING: Could not load psyco"

def __versionUpgrade0_1(old_cache):
    # 0.1 stored base64-encoded digests; decode them back to raw bytes
    import base64
    return '0.2', dict((filename, base64.b64decode(sha1hash)) for filename, sha1hash in old_cache)

def loadCache(filename = os.path.expanduser('~/.sha1_cache'), version = APPLICATION_VERSION):
    global _sha1_cache
    try:
        cache_version, cache = cPickle.load(open(filename, 'rb'))
        if cache_version == '0.1':
            cache_version, cache = __versionUpgrade0_1(cache)

        if cache_version != version:
            raise Exception("Invalid version")
        print 'WARNING: Using the cache file "%s", sha1 hashes may be stale' % filename
    except Exception:
        # A missing or unreadable cache simply means we start fresh
        cache = {}
    _sha1_cache = cache

def storeCache(filename = os.path.expanduser('~/.sha1_cache'), version = APPLICATION_VERSION):
    fd = open(filename, 'wb')
    try:
        cPickle.dump((version, _sha1_cache), fd)
    finally:
        fd.close()

class GlobalFileInfo(object):
    def __init__(self):
        # Maps (sha1 digest, size) -> (device, inode) -> list of paths.
        # Paths that share an inode are already hard-linked to each other.
        self.files = defaultdict(lambda : defaultdict(list))

    def _scanDirUpdateFile(self, dirPath, filename):
        def printPath(word):
            print '%s "%s"' % (word, filename[-80:])
        fullpath = os.path.abspath(os.path.join(dirPath, filename))
        if os.path.islink(fullpath) or not os.path.isfile(fullpath):
            printPath('Skipping')
            return
        try:
            statInfo = os.stat(fullpath)

            if statInfo.st_size < MINFILE_SIZE:
                printPath('Skipping')
                return
            printPath('Scanning')

            fileHash = getSha1(fullpath)
            self.files[(fileHash, statInfo.st_size)][(statInfo.st_dev, statInfo.st_ino)].append(fullpath)
        except (IOError, OSError):
            print >>sys.stderr, 'WARNING: Could not get sha1 of "%s"' % fullpath

    def scanDir(self, dirName):
        for dirPath, dirs, files in os.walk(dirName):
            print 'Scanning directory "%s"\n' % dirPath
            # Prune .svn directories so their contents are never scanned
            if '.svn' in dirs:
                dirs.remove('.svn')

            for f in files:
                # ANSI escapes: move the cursor up and clear the line, so each
                # file's status overwrites the previous one
                sys.stdout.write("\033[A\033[300D\033[2K")
                self._scanDirUpdateFile(dirPath, f)
            sys.stdout.write("\033[A\033[100D\033[2K")

    def findDuplicates(self):
        # A (hash, size) key with more than one inode group holds duplicates
        return [(key, inodes) for key, inodes in self.files.items() if len(inodes) > 1]

def prettyFormatDups(dups):
    ret = []
    for key, inodes in dups:
        section = []
        for inode_key, files in inodes.items():
            section.append('%s: %s' % (inode_key, ', '.join(files)))
        ret.append('%s\n\t%s' % (key, '\n\t'.join(section)))

    return '\n'.join(ret)

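# Illustrative output of prettyFormatDups, with hypothetical hashes and paths:
#
#   ('E5Iu2FkKH7Qw...', 4096)
#           (2049, 1311): /home/user/a.jpg, /home/user/b.jpg
#           (2049, 1312): /home/user/c.jpg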

def makeBashScript(dups, fd):
    spaceSaved = 0
    print >>fd, "#!/bin/bash"
    print >>fd, '# This script was created automatically by "%s"' % __file__
    # Emit a helper that hard-links INPUT_FILE over each remaining argument
    print >>fd
    print >>fd, 'function doLink() {'
    print >>fd, '\tINPUT_FILE="${1}"'
    print >>fd, '\tshift'
    print >>fd, '\tfor file in "$@" ; do'
    print >>fd, '\t\tln "${INPUT_FILE}" "${file}"'
    print >>fd, '\tdone'
    print >>fd, '}'

    for dup_key, inodes in dups:
        print >>fd
        print >>fd, '# Handling %s' % str(dup_key)
        # Keep the inode group that already has the most links; remove and
        # re-link the files in every smaller group
        inodes_data = inodes.items()
        inodes_data.sort(key = lambda x: len(x[1]), reverse = True)
        for inode_key, files in inodes_data[1:]:
            print >>fd, '# Removing files connected to inode %d on device %d' % (inode_key[1], inode_key[0])
            print >>fd, 'rm -f "%s"' % '" "'.join(files)
        fileToLink = inodes_data[0][1][0] # First filename of the largest group of already-linked files
        print >>fd, '# Now link all the files together'
        print >>fd, 'doLink "%s" "%s"' % (fileToLink, '" "'.join('" "'.join(files) for inode_key, files in inodes_data[1:]))
        spaceSaved += sum(len(files) for inode_key, files in inodes_data[1:]) * dup_key[1]

    print >>fd
    print >>fd, '# Total space saved: %d B (%d KB) (%d MB)' % (spaceSaved, spaceSaved / 1024, spaceSaved / 1024**2)
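
# For reference, the generated script looks roughly like this (hypothetical
# paths, inodes and sizes):
#
#   #!/bin/bash
#   # This script was created automatically by "findsame_file_new.py"
#
#   function doLink() { ... }
#
#   # Handling ('E5Iu2FkKH7Qw...', 4096)
#   # Removing files connected to inode 1312 on device 2049
#   rm -f "/home/user/c.jpg"
#   # Now link all the files together
#   doLink "/home/user/a.jpg" "/home/user/c.jpg"
#
#   # Total space saved: 4096 B (4 KB) (0 MB)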

def main():
    if len(sys.argv) < 3:
        print >>sys.stderr, 'Usage: %s OUTPUT_SCRIPT DIR [DIR ...]' % sys.argv[0]
        sys.exit(1)

    loadCache()
    files = GlobalFileInfo()

    for dirName in sys.argv[2:]:
        files.scanDir(dirName)

    storeCache()
    print "Done."
    # Open before the try block so fd is always defined in the finally clause
    fd = open(sys.argv[1], 'w')
    try:
        makeBashScript(files.findDuplicates(), fd)
    finally:
        fd.close()

if __name__ == "__main__":
    main()