]>
code.delx.au - bg-scripts/blob - bin/findsame_file_new.py
293b9fbc8adf497a2c412458014d9ba1fc5ba573
1 #!/usr/bin/env python2.5
4 FILEBUFFER_SIZE
= 1024**2
5 APPLICATION_VERSION
= '0.2'
9 import python24_adapter
11 from base64
import standard_b64encode
as b64encode
12 from collections
import defaultdict
17 def _getSha1(filename
):
21 def _getSha1(filename
):
23 def getSha1(filename
):
24 if _sha1_cache
.has_key(filename
):
25 return b64encode(_sha1_cache
[filename
])
27 sha1
= _getSha1(filename
)
28 f
= file(filename
, 'r')
29 data
= f
.read(FILEBUFFER_SIZE
)
32 data
= f
.read(FILEBUFFER_SIZE
)
35 _sha1_cache
[filename
] = ret
42 print >>sys
.stderr
, "WARNING: Could not load psyco"
44 def __versionUpgrade0_1(input):
46 return '0.2', dict((filename
, base64
.b64decode(sha1hash
)) for filename
, sha1hash
in input)
48 def loadCache(filename
= os
.path
.expanduser('~/.sha1_cache'), version
= APPLICATION_VERSION
):
51 cache_version
, cache
= cPickle
.load(open(filename
, 'rb'))
52 if cache_version
== '0.1':
53 cache_version
, cache
= __versionUpgrade0_1(cache
)
55 if cache_version
!= version
:
56 raise Exception("Invalid Version")
57 print 'WARNING: Using the cache file "%s", sha1 hash may be old' % filename
62 def storeCache(filename
= os
.path
.expanduser('~/.sha1_cache'), version
= APPLICATION_VERSION
):
63 fd
= open(filename
, 'wb')
65 cPickle
.dump((version
, _sha1_cache
), fd
)
69 class GlobalFileInfo(object):
71 self
.files
= defaultdict(lambda : defaultdict(list))
73 def _scanDirUpdateFile(self
, dirPath
, filename
):
75 print '%s "%s"' % (word
, filename
[-80:])
76 fullpath
= os
.path
.abspath(os
.path
.join(dirPath
, filename
))
77 if os
.path
.islink(fullpath
) or not os
.path
.isfile(fullpath
):
81 statInfo
= os
.stat(fullpath
)
83 if statInfo
.st_size
< MINFILE_SIZE
:
88 fileHash
= getSha1(fullpath
)
89 self
.files
[(fileHash
, statInfo
.st_size
)][(statInfo
.st_dev
, statInfo
.st_ino
)].append(fullpath
)
91 print >>sys
.stderr
, 'WARNING: Could not get sha1 of "%s"\n' % (fullpath
)
93 def scanDir(self
, dirName
):
94 for dirPath
, dirs
, files
in os
.walk(dirName
):
95 print 'Scanning directory "%s"\n' % dirPath
96 # Add the children Directories
101 sys
.stdout
.write("\033[A\033[300D\033[2K")
102 self
._scanDirUpdateFile
(dirPath
, f
)
103 sys
.stdout
.write("\033[A\033[100D\033[2K")
def findDuplicates(self):
    """Return the file groups whose content is stored under several inodes.

    ``self.files`` maps a ``(sha1 hash, file size)`` key to a dict that
    maps ``(st_dev, st_ino)`` to the list of paths sharing that inode.
    A key with more than one inode entry means the same content exists
    in more than one physical copy.

    Returns a list of ``(key, inodes)`` tuples, one per such key; the
    list is empty when nothing is duplicated.
    """
    # Paths that are already hard-linked share one inode entry, so only
    # keys with at least two inode entries are reported.
    return [(key, inodes)
            for key, inodes in self.files.items()
            if len(inodes) > 1]
107 def prettyFormatDups(dups
):
110 '\n\t'.join('%s: %s' % (inode_key
, ', '.join(files
)) for inode_key
, files
in inodes
.items()) \
111 ) for key
, inodes
in dups \
115 for key
, inodes
in dups
:
117 for inode_key
, files
in inodes
.items():
118 section
.append('%s: %s' % (inode_key
, ', '.join(files
)))
119 ret
.append('%s\n\t%s' % (key
, '\n\t'.join(section
)))
121 return '\n'.join(ret
)
124 def makeBashScript(dups
, fd
):
126 print >>fd
, "#!/bin/bash"
127 print >>fd
, '# This script was created automatically by "%s"' % __file__
128 # Print out a helper function
130 print >>fd
, 'function doLink() {'
131 print >>fd
, '\tINPUT_FILE="${1}"'
132 print >>fd
, '\tshift'
133 print >>fd
, '\tfor file in "$@" ; do'
134 print >>fd
, '\t\tln "${INPUT_FILE}" "${file}"'
138 for dup_key
, inodes
in dups
:
140 print >>fd
, '# Handling %s' % str(dup_key
)
141 inodes_data
= inodes
.items()
142 inodes_data
.sort(key
= lambda x
: len(x
[1]), reverse
= True)
143 for inode_key
, files
in inodes_data
[1:]:
144 print >>fd
, '# Removing files connected to inode %d on device %d' % (inode_key
[1], inode_key
[0])
145 print >>fd
, 'rm -f "%s"' % '" "'.join(file for file in files
)
146 fileToLink
= inodes_data
[0][1][0] # Get the first filename of the largest group of (already) linked files
147 print >>fd
, '# Now link all the files together'
148 print >>fd
, 'doLink "%s" "%s"' % (fileToLink
, '" "'.join('" "'.join(files
) for inode_key
, files
in inodes_data
[1:]))
149 spaceSaved
+= sum(len(files
) for inode_key
, files
in inodes_data
[1:]) * dup_key
[1]
152 print >>fd
, '# Total space saved: %d B (%dK B) (%d MB)' % (spaceSaved
, spaceSaved
/ 1024, spaceSaved
/ 1024**2)
156 files
= GlobalFileInfo()
158 for dir in sys
.argv
[2:]:
164 fd
= open(sys
.argv
[1], 'wb')
165 makeBashScript(files
.findDuplicates(), fd
)
169 if __name__
== "__main__":