[Evolution] Duplicate Messages

Dan Stromberg strombrg@dcs.nac.uci.edu
02 Sep 2003 13:13:12 -0700


--=-gGjkL+Zm6tv/vl/DJwP4
Content-Type: multipart/mixed; boundary="=-p+EhcZ9+mMZHml6vzUHm"


--=-p+EhcZ9+mMZHml6vzUHm
Content-Type: text/plain
Content-Transfer-Encoding: quoted-printable

On Sun, 2003-08-31 at 15:16, mpierce wrote:
> Is there a way to find duplicate messages in a specific folder?
>=20
>=20
> _______________________________________________
> evolution maillist  -  evolution@lists.ximian.com
> http://lists.ximian.com/mailman/listinfo/evolution

You could import the messages from the folder into an MH folder, and
then run "classify" on them.  You may have to grep out some headers
first though.

If classify is hard to find, I'm attaching my equivs program, which does
much the same thing.  classify has more bells and whistles, but equivs
tends to be fast on medium sized collections of files.  classify is fine
for small collections.  I keep thinking I should add an algorithm to
equivs to make it work well on large collections, but I keep not finding
an actual need for it.  :)

This was written back before I knew python very well.  So it's probably
not code to study and emulate.

--=20
Dan Stromberg DCS/NACS/UCI <strombrg@dcs.nac.uci.edu>


--=-p+EhcZ9+mMZHml6vzUHm
Content-Disposition: attachment; filename=equivs
Content-Type: text/plain; name=equivs; charset=ISO-8859-1
Content-Transfer-Encoding: quoted-printable

#!/dcs/bin/python

# def revcmp(a, b): return cmp(b, a)
# a.sort(revcmp)

import posix
import os
import stat
import sys
import string

class file_bucket:
=09
	def __init__(self,attributes):
		self.attributes=3Dattributes
		self.names=3D[attributes.name]
=09
	def add_name(self,name):
		self.names =3D self.names + [name]
=09
	def __str__(self):
		return string.join(self.names)

class file_attributes:

	def __init__(self,name):
		self.name =3D name
		statbuf=3Dposix.stat(name)
		self.dev =3D statbuf[stat.ST_DEV]
		self.ino =3D statbuf[stat.ST_INO]
		self.len =3D statbuf[stat.ST_SIZE]
		self.len_summed =3D 0
		self.checksum =3D 0

	def extend_sum(self):
		max_chunk=3D32768
		len=3Dself.len-self.len_summed
		if len > max_chunk:
			len=3Dmax_chunk
		if len =3D=3D 0:
			return
		# open for reading only
		fp=3Dposix.open(self.name,0)
		dummy=3Dposix.lseek(fp,self.len_summed,0)
		buf=3Dposix.read(fp,len)
		posix.close(fp)
#		for chr in buf[0:len-1]:
#			self.checksum =3D self.checksum + ord(chr)
		# I suspect a large prime is good here...  Make sure it's larger
		# than 256*max_chunk, or move the % into the loop!
#		self.checksum =3D self.checksum % 366829
		self.checksum =3D (self.checksum + hash(buf)) % 366829
		self.len_summed =3D self.len_summed + len
=09
	# derived from Lib/cmp.py
	def do_cmp(self, other): # Compare two files, for real
		bufsize =3D 8096 # Could be tuned
		self_fp =3D posix.open(self.name, 0)
		other_fp =3D posix.open(other.name, 0)
		while 1:
			self_buf =3D posix.read(self_fp,bufsize)
			other_buf =3D posix.read(other_fp,bufsize)
			if self_buf <> other_buf:
				posix.close(self_fp)
				posix.close(other_fp)
				return 0
			if not self_buf:
				posix.close(self_fp)
				posix.close(other_fp)
				return 1

	# it might be helpful to make this idempotent, someday
	def are_equal(self,other):
		if self.dev =3D=3D other.dev and self.ino =3D=3D other.ino:
			return 1
		if self.len <> other.len:
			return 0
		if self.len_summed =3D=3D 0:
			self.extend_sum()
		if other.len_summed =3D=3D 0:
			other.extend_sum()
		# extend the len_summed's as needed, to make them equal.
		# don't forget that the files must have equal length to reach this
		# point
		while self.len_summed < other.len_summed:
			self.extend_sum()
		while other.len_summed < self.len_summed:
			other.extend_sum()
		# while the sums are equal, and we haven't hit the end of the files...
		while self.len_summed < self.len and self.checksum =3D=3D other.checksum:
			self.extend_sum()
			other.extend_sum()
		if self.checksum <> other.checksum:
			return 0
		# These two files have identical lengths and identical checksums
		# It's time to do it the hard way
		return file_attributes.do_cmp(self,other)
		=09
	def __repr__(self):
		return self.name


def main():
	equivs=3D[]
	# this is currently O(n^2), and operates much like a worst-case insertion
	# sort.  It -could- be done O(nlogn), but I'm not sure I see how to do
	# so without giving up the incremental checksum heuristic.  It might
	# be worth trying without the checksumming someday; probably wouldn't
	# take long - and I suppose either could be allowed with a command-line
	# switch
	if sys.argv[1] =3D=3D '-v':
		del sys.argv[1]
		verbose=3D1
	else:
		verbose=3D0
	fileno=3D0
	for filename in sys.argv[1:]:
		if verbose:
			fileno=3Dfileno+1
			sys.stderr.write('adding '+filename+' '+`fileno`+' ')
			sys.stderr.write(`len(sys.argv)-fileno`+' '+`len(equivs)`+'\n')
		#print filename,equivs
		nextfile =3D file_attributes(filename)
		for possible_match in equivs:
			if file_attributes.are_equal(nextfile,possible_match.attributes):
				possible_match =3D possible_match.add_name(filename)
				break
		else:
			equivs =3D equivs + [file_bucket(nextfile)]
	#print
 	for file in equivs:
		print string.join(file.names)

main()

#filelist=3D[[]]
#for file in sys.argv[1:]:
#	filelist[0]=3Dfilelist[0] + [file]
#
#print filelist
#
#
#stat1=3Dposix.stat(filelist[0][0])
##print stat
#for filenum in range(len(filelist)-1):
#	stat2=3Dposix.stat(filelist[filenum+1][0])
#	print filelist[filenum], filelist[filenum+1]
#	print stat1[stat.ST_SIZE],stat2[stat.ST_SIZE]
#	stat1=3Dstat2
#=09
#
##stat=3Dposix.stat(file)
#


--=-p+EhcZ9+mMZHml6vzUHm--

--=-gGjkL+Zm6tv/vl/DJwP4
Content-Type: application/pgp-signature; name=signature.asc
Content-Description: This is a digitally signed message part

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.0.7 (GNU/Linux)

iD8DBQA/VPnYo0feVm00f/8RAqjtAJ9dWysGEzoKI6xFd0/bKCKajgw2DgCgpPax
IOuAr81jmlZPwX9wY1gWOIs=
=ZONB
-----END PGP SIGNATURE-----

--=-gGjkL+Zm6tv/vl/DJwP4--