]> code.delx.au - bg-scripts/blob - bin/findsame_file.py
1a5ce8ddff153794a8518d42d2c786aecca60521
[bg-scripts] / bin / findsame_file.py
1 #!/usr/bin/env python2.5
2
3 MINFILE_SIZE = 1024
4 FILEBUFFER_SIZE = 1024**2
5
6 import os, sys, bisect
7
8 from copy import copy
9 from base64 import standard_b64encode as b64encode
10 from collections import defaultdict
11 import cPickle
# Pick the SHA-1 implementation once at import time: prefer hashlib and
# fall back to the older ``sha`` module when hashlib cannot be imported.
try:
    import hashlib
except ImportError:
    import sha

    def _getSha1(filename):
        """Return a fresh SHA-1 hash object (``filename`` is not used)."""
        return sha.new()
else:
    def _getSha1(filename):
        """Return a fresh SHA-1 hash object (``filename`` is not used)."""
        return hashlib.sha1()
def getSha1(filename):
    """Return the base64-encoded SHA-1 digest of the file's contents.

    The file is read in chunks of FILEBUFFER_SIZE bytes so that large files
    are never held in memory in one piece.

    Raises IOError (from ``open``/``read``) if the file cannot be read.
    """
    sha1 = _getSha1(filename)
    # Binary mode: the digest must cover the raw bytes, without any
    # platform newline translation.
    f = open(filename, 'rb')
    try:
        data = f.read(FILEBUFFER_SIZE)
        while data:
            sha1.update(data)
            data = f.read(FILEBUFFER_SIZE)
    finally:
        # Close explicitly; a long scan would otherwise depend on garbage
        # collection to release the file handles.
        f.close()
    return b64encode(sha1.digest())
28
# psyco is optional: when it imports, psyco.full() is called; when it is
# missing, a warning goes to stderr and the script carries on without it.
try:
    import psyco
    psyco.full()
except ImportError:
    sys.stderr.write("WARNING: Could not load psyco" + "\n")
34
class DiskObject(object):
    """Base class for objects identified by a filesystem path.

    Subclasses must provide ``getFullPath()``; representation, ordering,
    equality and hashing are all derived from the string it returns.
    """

    def __repr__(self):
        return repr(self.getFullPath())

    def __str__(self):
        return self.getFullPath()

    def __lt__(self, other):
        if not hasattr(other, 'getFullPath'):
            # Returning (not raising or calling) NotImplemented lets Python
            # try the reflected operation / its default handling.
            return NotImplemented
        return self.getFullPath() < other.getFullPath()

    def __eq__(self, other):
        if not hasattr(other, 'getFullPath'):
            return NotImplemented
        return self.getFullPath() == other.getFullPath()

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so keep the two consistent.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Must agree with __eq__: equal paths hash equally.
        return hash(self.getFullPath())
50
class Folder(DiskObject):
    """A directory node in the in-memory tree.

    Each node stores its own ``name``, a reference to its ``parent`` node
    (or None) and a ``children`` dict mapping child names to child objects.
    """

    def __init__(self, name, parent = None):
        """Create the node and register it with ``parent`` if one is given.

        When ``name`` contains a path separator (and is not just the
        separator itself) it is echoed to stdout, split with os.path.split,
        and the leading part becomes a newly created parent Folder.
        """
        sep = os.path.sep
        if sep in name and name != sep:
            sys.stdout.write('%s\n' % name)
            head, tail = os.path.split(name)
            parent = Folder(head, parent)
            name = tail

        self.children = {}
        self.parent = parent
        self.name = name
        if parent:
            parent.addChild(self)

    def getFullPath(self):
        """Return the names from the topmost ancestor down to this node,
        joined with os.path.sep."""
        names = []
        node = self
        while node:
            # Prepending while walking upwards yields root-first order.
            names.insert(0, node.name)
            node = node.parent
        return os.path.sep.join(names)

    def addChild(self, child):
        """Store ``child`` under its name, replacing any previous entry."""
        self.children[child.name] = child
72
def findDirectory(rootDir, dirName, createNonExistant = False):
    """Walk down from ``rootDir`` to the node for the path ``dirName``.

    Parameters:
        rootDir: node whose ``children`` dict is the starting point.
        dirName: directory path.  A relative path is anchored at the
            current working directory; an empty string maps to ``rootDir``.
        createNonExistant: when true, missing nodes are created as Folder
            objects on the way down instead of raising.

    Returns the node for the last path component (``rootDir`` itself when
    the path has no components).

    Raises KeyError if a component is missing and ``createNonExistant`` is
    false.
    """
    # The first component is always dropped below (it is the empty string
    # in front of the leading separator), so a relative path must be made
    # absolute first or its first real component would be lost.
    if dirName and not os.path.isabs(dirName):
        dirName = os.path.join(os.getcwd(), dirName)

    # Ignore empty components (trailing or doubled separators) and '.'.
    components = [part for part in dirName.split(os.path.sep)[1:]
                  if part and part != os.curdir]

    ret = rootDir
    for folderName in components:
        try:
            ret = ret.children[folderName]
        except KeyError:
            if not createNonExistant:
                raise
            ret = Folder(folderName, ret)

    return ret
88
class FileObject(DiskObject):
    """A file inside a Folder, with the mtime and size seen at creation."""

    def __init__(self, name, folder):
        """Record ``name``/``folder`` and stat the file once.

        Raises OSError (from os.stat) if the file cannot be stat'ed.
        """
        self.name = name
        self.folder = folder
        statinfo = os.stat(self.getFullPath())
        # Snapshot used later to decide whether the file has changed.
        self.mtime_size = (statinfo.st_mtime, statinfo.st_size)

    def getDiskID(self):
        """Return ``(st_dev, st_ino)`` from a fresh os.stat of the file."""
        statinfo = os.stat(self.getFullPath())
        return (statinfo.st_dev, statinfo.st_ino) # Identify the file

    def get_mtime_size(self):
        """Return the ``(mtime, size)`` tuple captured in __init__."""
        return self.mtime_size

    def getFullPath(self):
        """Return the folder's full path and the file name, joined by
        os.path.sep (the same separator Folder.getFullPath uses)."""
        return os.path.sep.join((self.folder.getFullPath(), self.name))
102
class GlobalFileInfo(object):
    """Index of scanned files, grouped by the SHA-1 of their contents.

    Attributes:
        files: defaultdict mapping a digest (as returned by getSha1) to the
            list of FileObject instances that have that digest.
        filelist: dict mapping each recorded FileObject to itself; since
            FileObjects hash and compare by path, it is used to look up the
            record from an earlier scan of the same path.
        root: the Folder('') node at the top of the directory tree.
    """

    def __init__(self):
        self.files = defaultdict(list)
        self.filelist = {}
        self.root = Folder('')

    def _scanDirUpdateFile(self, dirObject, dirPath, filename):
        """Hash one file and record it, unless it can be skipped.

        Skipped are: symlinks and non-regular files, files whose
        (mtime, size) equals the recorded value, and files smaller than
        MINFILE_SIZE.  A changed file has its old record removed first.
        Read/stat failures are reported on stderr and otherwise ignored.
        """
        def printPath(word):
            # Only the last 80 characters of the name are shown.
            sys.stdout.write('%s "%s"\n' % (word, filename[-80:]))

        fullpath = os.path.join(dirPath, filename)
        if os.path.islink(fullpath) or not os.path.isfile(fullpath):
            printPath('Skipping')
            return
        try:
            entry = FileObject(filename, dirObject)

            if entry in self.filelist:
                known = self.filelist[entry]
                if entry.get_mtime_size() == known.get_mtime_size():
                    printPath('Skipping')
                    return
                # The file changed: drop the stale record before rehashing.
                old_sha1 = known.sha1
                del self.filelist[entry]
                self.files[old_sha1].remove(entry)

            if entry.get_mtime_size()[1] < MINFILE_SIZE:
                printPath('Skipping')
                return
            printPath('Scanning')

            entry.sha1 = getSha1(fullpath)
            self.files[entry.sha1].append(entry)
            self.filelist[entry] = entry
        except (IOError, OSError):
            # IOError comes from reading the file; OSError from the os.stat
            # in FileObject (e.g. the file vanished after the isfile check).
            sys.stderr.write(
                'WARNING: Could not get sha1 of "%s"\n' % (fullpath) + '\n')

    def scanDir(self, dirName):
        """Walk ``dirName`` recursively and update the index for every file.

        Progress is written to stdout using terminal escape sequences.
        Directories named '.svn' are not descended into.
        """
        # Make sure the Folder chain for dirName exists before walking.
        findDirectory(self.root, dirName, createNonExistant = True)

        for dirPath, dirs, files in os.walk(dirName):
            sys.stdout.write('Scanning directory "%s"\n' % dirPath + '\n')
            folder = findDirectory(self.root, dirPath, createNonExistant = True)
            # Removing the entry in place stops os.walk from descending.
            if '.svn' in dirs:
                dirs.remove('.svn')
            # Add the children directories.  Folder() registers itself with
            # its parent; an existing node is kept so that a rescan does
            # not throw away the subtree already built below it.
            for d in dirs:
                if d not in folder.children:
                    Folder(d, folder)

            for f in files:
                # Cursor up one line, far left, clear the line: the next
                # status line overwrites the previous one.
                sys.stdout.write("\033[A\033[300D\033[2K")
                self._scanDirUpdateFile(folder, dirPath, f)
            # NOTE(review): nesting reconstructed from a flattened source;
            # assumed to run once per directory - confirm.
            sys.stdout.write("\033[A\033[100D\033[2K")

    def findDuplicates(self):
        """Return ``[(sha1, [FileObject, ...]), ...]`` for every digest that
        is shared by more than one recorded file."""
        return [(sha1, list(filenames))
                for sha1, filenames in self.files.items()
                if len(filenames) > 1]
157
def main():
    """Command-line entry point: ``findsame_file.py CACHE_FILE [DIR ...]``.

    Loads the pickled GlobalFileInfo from CACHE_FILE (starting with an
    empty one if the file cannot be opened), scans every DIR given, and
    writes the updated index back to CACHE_FILE.
    """
    if len(sys.argv) < 2:
        sys.stderr.write(
            'Usage: %s CACHE_FILE [DIRECTORY ...]\n' % sys.argv[0])
        sys.exit(1)
    cache_path = sys.argv[1]

    # NOTE(security): unpickling can execute arbitrary code; only use cache
    # files that this script wrote itself.
    try:
        # Binary mode, matching the binary pickle protocol used for writing.
        cache_file = open(cache_path, 'rb')
        try:
            files = cPickle.load(cache_file)
        finally:
            cache_file.close()
    except IOError:
        files = GlobalFileInfo()

    for directory in sys.argv[2:]:
        # Strip one trailing slash, but leave a bare "/" alone: stripping
        # it would leave an empty path and nothing would be scanned.
        if len(directory) > 1 and directory[-1] == '/':
            directory = directory[:-1]
        files.scanDir(directory)

    out_file = open(cache_path, 'wb')
    try:
        cPickle.dump(files, out_file, 2)
    finally:
        out_file.close()
    sys.stdout.write("Done\n")


if __name__ == "__main__":
    main()