Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 96165
License: OTHER
1
"""This module contains a code example related to

Think Python, 2nd Edition
by Allen Downey
http://thinkpython2.com

Copyright 2015 Allen Downey

License: http://creativecommons.org/licenses/by/4.0/
"""
11
12
from __future__ import print_function, division
13
14
import os
15
16
17
def walk(dirname):
    """Finds the names of all files in dirname and its subdirectories.

    Any directory whose path contains '__pycache__' is skipped.  Entries
    that are neither regular files nor directories (for example broken
    symbolic links, sockets or FIFOs) are ignored rather than being
    handed to os.listdir, which would raise an OSError for them.

    dirname: string name of directory

    Returns: list of string paths, each built by joining onto dirname
    """
    names = []
    if '__pycache__' in dirname:
        return names

    for name in os.listdir(dirname):
        path = os.path.join(dirname, name)

        if os.path.isfile(path):
            names.append(path)
        elif os.path.isdir(path):
            # only real directories can be listed recursively
            names.extend(walk(path))
    return names
34
35
36
def compute_checksum(filename):
    """Computes the MD5 checksum of the contents of a file.

    The digest is computed in-process with hashlib instead of building
    a shell command from the filename, so names containing spaces or
    shell metacharacters are handled safely and no external md5sum
    program is needed.

    filename: string

    Returns (res, stat): res is a line of the form
    '<hex digest>  <filename>\\n', so the checksum is the first
    whitespace-separated field; stat is always None.
    """
    import hashlib

    md5 = hashlib.md5()
    with open(filename, 'rb') as fp:
        # read in chunks so large files are not loaded into memory at once
        for chunk in iter(lambda: fp.read(65536), b''):
            md5.update(chunk)

    res = '%s  %s\n' % (md5.hexdigest(), filename)
    return res, None
43
44
45
def check_diff(name1, name2):
    """Computes the difference between the contents of two files.

    Runs the external diff program with an argument list (no shell), so
    filenames containing spaces or shell metacharacters are passed
    through unchanged.  diff exits with status 1 when the files differ;
    that is a normal result here, not an error.

    name1, name2: string filenames

    Returns (res, stat): res is the text diff wrote to standard output
    (empty when the files are identical); stat is None when diff exited
    with status 0 and 1 when the files differ.

    Raises subprocess.CalledProcessError if diff reports trouble (any
    other exit status), e.g. because one of the files cannot be read.
    """
    import subprocess

    # '--' stops option parsing so a name starting with '-' is a filename
    cmd = ['diff', '--', name1, name2]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            universal_newlines=True)
    res, _ = proc.communicate()

    status = proc.returncode
    if status not in (0, 1):
        raise subprocess.CalledProcessError(status, cmd)

    stat = None if status == 0 else status
    return res, stat
52
53
54
def pipe(cmd):
    """Runs a command in a subprocess.

    cmd: string Unix command

    Returns (res, stat), the output of the subprocess and the exit status.
    stat is None when the command exited with status zero; otherwise it
    is whatever non-None value closing the pipe reports.  The status is
    handed back to the caller instead of being asserted on, because some
    commands (diff, for example) use a non-zero status for ordinary
    results.
    """
    # Note: os.popen has been described as deprecated in favour of the
    # subprocess module.  For simple cases subprocess is more
    # complicated than necessary, so this keeps using os.popen.

    fp = os.popen(cmd)
    try:
        res = fp.read()
    finally:
        # always reap the child, even if reading its output failed
        stat = fp.close()
    return res, stat
72
73
74
def compute_checksums(dirname, suffix):
    """Computes checksums for all files with the given suffix.

    dirname: string name of directory to search
    suffix: string suffix to match

    Returns: map from checksum to list of files with that checksum

    Raises ValueError if no checksum could be read for a file.
    """
    d = {}
    for name in walk(dirname):
        if not name.endswith(suffix):
            continue

        res, _ = compute_checksum(name)

        # The checksum is the first whitespace-separated field.  Split
        # only once so filenames that themselves contain whitespace do
        # not break the parsing.
        fields = res.split(None, 1)
        if not fields:
            raise ValueError('no checksum produced for %s' % name)

        d.setdefault(fields[0], []).append(name)

    return d
96
97
98
def check_pairs(names):
    """Checks whether any in a list of files differs from the others.

    Having identical contents is transitive, so every file is compared
    with the first one only; that needs len(names) - 1 diffs instead of
    one diff per pair.

    names: list of string filenames

    Returns: True if no difference was found (including for an empty or
    one-element list), False as soon as one file differs.
    """
    if not names:
        return True

    first = names[0]
    for other in names[1:]:
        if other == first:
            # the same path listed twice cannot differ from itself
            continue
        res, _ = check_diff(first, other)
        if res:
            return False
    return True
110
111
112
def print_duplicates(d):
    """Reports groups of files that share a checksum.

    For every checksum that maps to more than one file, prints the file
    names and then, if check_pairs finds no difference between them,
    prints a line saying they are identical.

    d: map from checksum to list of files with that checksum
    """
    for group in d.values():
        if len(group) <= 1:
            # a checksum seen only once has no duplicates to report
            continue

        print('The following files have the same checksum:')
        for filename in group:
            print(filename)

        identical = check_pairs(group)
        if identical:
            print('And they are identical.')
128
129
130
if __name__ == '__main__':
    # Report duplicated .py files found under the current directory.
    print_duplicates(compute_checksums(dirname='.', suffix='.py'))
133
134