]>
code.delx.au - bg-scripts/blob - bin/findsame_file.py
1 #!/usr/bin/env python2.5
4 FILEBUFFER_SIZE
= 1024**2
9 from base64
import standard_b64encode
as b64encode
10 from collections
import defaultdict
14 def _getSha1(filename
):
18 def _getSha1(filename
):
20 def getSha1(filename
):
21 sha1
= _getSha1(filename
)
22 f
= file(filename
, 'r')
23 data
= f
.read(FILEBUFFER_SIZE
)
26 data
= f
.read(FILEBUFFER_SIZE
)
27 return b64encode(sha1
.digest())
33 print >>sys
.stderr
, "WARNING: Could not load psyco"
35 class DiskObject(object):
37 return repr(self
.getFullPath())
39 return self
.getFullPath()
def __lt__(self, other):
    """Order disk objects by comparing their full paths.

    Returns True/False from comparing ``self.getFullPath()`` with
    ``other.getFullPath()``.  If ``other`` has no ``getFullPath``
    attribute, returns ``NotImplemented`` so the interpreter can try the
    reflected comparison on ``other``.
    """
    if not hasattr(other, 'getFullPath'):
        # NotImplemented is a sentinel that must be *returned*; it is not an
        # exception class, and calling it fails with a TypeError.
        return NotImplemented
    return self.getFullPath() < other.getFullPath()
def __eq__(self, other):
    """Treat two disk objects as equal when their full paths are equal.

    Returns True/False from comparing ``self.getFullPath()`` with
    ``other.getFullPath()``.  If ``other`` has no ``getFullPath``
    attribute, returns ``NotImplemented`` so the interpreter can fall back
    to the other operand's comparison.
    """
    if not hasattr(other, 'getFullPath'):
        # NotImplemented is a sentinel that must be *returned*; it is not an
        # exception class, and calling it fails with a TypeError.
        return NotImplemented
    return self.getFullPath() == other.getFullPath()
49 return hash(self
.getFullPath())
51 class Folder(DiskObject
):
52 def __init__(self
, name
, parent
= None):
53 if name
.find(os
.path
.sep
) >= 0 and name
!= os
.path
.sep
:
55 parent_name
, name
= os
.path
.split(name
)
56 parent
= Folder(parent_name
, parent
)
63 def getFullPath(self
):
67 folderStack
.append(f
.name
)
69 return os
.path
.sep
.join(reversed(folderStack
))
def addChild(self, child):
    """Store ``child`` in ``self.children``, keyed by ``child.name``."""
    key = child.name
    self.children[key] = child
73 def findDirectory(rootDir
, dirName
, createNonExistant
= False):
74 dir = dirName
.split(os
.path
.sep
)[1:]
79 for folderName
in dir:
81 ret
= ret
.children
[folderName
]
83 if not createNonExistant
:
85 ret
= Folder(folderName
, ret
)
89 class FileObject(DiskObject
):
90 def __init__(self
, name
, folder
):
93 statinfo
= os
.stat(self
.getFullPath())
94 self
.mtime_size
= (statinfo
.st_mtime
, statinfo
.st_size
)
96 statinfo
= os
.stat(self
.getFullPath())
97 return (statinfo
.st_dev
, statinfo
.st_ino
) # Identify the file
def get_mtime_size(self):
    """Return the ``mtime_size`` value stored on this object.

    Elsewhere in this file the attribute is assigned a
    ``(st_mtime, st_size)`` tuple taken from ``os.stat``.
    """
    recorded = self.mtime_size
    return recorded
def getFullPath(self):
    """Return this file's path: the folder's full path plus the file name.

    The two parts are joined with ``os.path.sep`` -- the separator that
    Folder.getFullPath and findDirectory in this file also use -- instead
    of a hard-coded '/'.
    """
    return os.path.sep.join((self.folder.getFullPath(), self.name))
103 class GlobalFileInfo(object):
105 self
.files
= defaultdict(list)
107 self
.root
= Folder('')
109 def _scanDirUpdateFile(self
, dirObject
, dirPath
, filename
):
111 print '%s "%s"' % (word
, filename
[-80:])
112 fullpath
= os
.path
.join(dirPath
, filename
)
113 if os
.path
.islink(fullpath
) or not os
.path
.isfile(fullpath
):
114 printPath('Skipping')
117 file = FileObject(filename
, dirObject
)
118 new_mtime_size
= file.get_mtime_size()
120 if file in self
.filelist
:
121 if file.get_mtime_size() == self
.filelist
[file].get_mtime_size():
122 printPath('Skipping')
124 old_sha1
= self
.filelist
[file].sha1
125 del self
.filelist
[file]
126 self
.files
[old_sha1
].remove(file)
128 if file.get_mtime_size()[1] < MINFILE_SIZE
:
129 printPath('Skipping')
131 printPath('Scanning')
133 file.sha1
= getSha1(fullpath
)
134 self
.files
[file.sha1
].append(file)
135 self
.filelist
[file] = file
137 print >>sys
.stderr
, 'WARNING: Could not get sha1 of "%s"\n' % (fullpath
)
139 def scanDir(self
, dirName
):
140 root
= findDirectory(self
.root
, dirName
, createNonExistant
= True)
142 for dirPath
, dirs
, files
in os
.walk(dirName
):
143 print 'Scanning directory "%s"\n' % dirPath
144 folder
= findDirectory(self
.root
, dirPath
, createNonExistant
= True)
145 # Add the children Directories
149 Folder(d
, folder
) # As a side effect, this is added to the parent correctly
152 sys
.stdout
.write("\033[A\033[300D\033[2K")
153 self
._scanDirUpdateFile
(folder
, dirPath
, f
)
154 sys
.stdout
.write("\033[A\033[100D\033[2K")
def findDuplicates(self):
    """Return ``(sha1, files)`` pairs for digests shared by several files.

    Walks ``self.files`` and keeps every entry whose value holds more than
    one item; each kept value is copied into a new list.
    """
    duplicates = []
    for digest, entries in self.files.items():
        if len(entries) > 1:
            duplicates.append((digest, list(entries)))
    return duplicates
160 files
= cPickle
.load(open(sys
.argv
[1]))
162 files
= GlobalFileInfo()
164 for dir in sys
.argv
[2:]:
169 cPickle
.dump(files
, open(sys
.argv
[1], 'wb'), 2)
172 ### print files.files
174 if __name__
== "__main__":