[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr...
From: |
mldonkey-commits |
Subject: |
[Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr... |
Date: |
Mon, 28 Aug 2006 19:02:17 +0000 |
CVSROOT: /sources/mldonkey
Module name: mldonkey
Changes by: spiralvoice <spiralvoice> 06/08/28 19:02:17
Modified files:
distrib : ChangeLog
src/daemon/driver: driverCommands.ml driverInteractive.ml
Log message:
patch #5341
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/mldonkey/distrib/ChangeLog?cvsroot=mldonkey&r1=1.979&r2=1.980
http://cvs.savannah.gnu.org/viewcvs/mldonkey/src/daemon/driver/driverCommands.ml?cvsroot=mldonkey&r1=1.167&r2=1.168
http://cvs.savannah.gnu.org/viewcvs/mldonkey/src/daemon/driver/driverInteractive.ml?cvsroot=mldonkey&r1=1.94&r2=1.95
Patches:
Index: distrib/ChangeLog
===================================================================
RCS file: /sources/mldonkey/mldonkey/distrib/ChangeLog,v
retrieving revision 1.979
retrieving revision 1.980
diff -u -b -r1.979 -r1.980
--- distrib/ChangeLog 28 Aug 2006 18:21:09 -0000 1.979
+++ distrib/ChangeLog 28 Aug 2006 19:02:17 -0000 1.980
@@ -15,6 +15,9 @@
=========
2006/08/28
+5341: New command: filenames_variability
+- compute differences between the filenames of a file to detect fakes
+ the higher the value, the more likely its a fake
5335: Fix small bugs in "!" command (execute shell commands)
5334: HTML: Display edk2 links in search results
5333: Clean logging (Swarmer, Fasttrack, Gnutella)
Index: src/daemon/driver/driverCommands.ml
===================================================================
RCS file: /sources/mldonkey/mldonkey/src/daemon/driver/driverCommands.ml,v
retrieving revision 1.167
retrieving revision 1.168
diff -u -b -r1.167 -r1.168
--- src/daemon/driver/driverCommands.ml 28 Aug 2006 18:21:10 -0000 1.167
+++ src/daemon/driver/driverCommands.ml 28 Aug 2006 19:02:17 -0000 1.168
@@ -2717,6 +2717,12 @@
with e -> Printf.sprintf (_b "No file number %d, error %s") num
(Printexc2.to_string e)
), "<num> \"<new name>\" :\t\tchange name of download <num> to <new name>";
+ "filenames_variability", Arg_none (fun o ->
+ let list = List2.tail_map file_info !!files in
+ DriverInteractive.filenames_variability o list;
+ _s "done"
+ ), ":\t\t\t\tTell what files have several very different names";
+
"dllink", Arg_multiple (fun args o ->
let url = String2.unsplit args ' ' in
dllink_parse (o.conn_output = HTML) url
Index: src/daemon/driver/driverInteractive.ml
===================================================================
RCS file: /sources/mldonkey/mldonkey/src/daemon/driver/driverInteractive.ml,v
retrieving revision 1.94
retrieving revision 1.95
diff -u -b -r1.94 -r1.95
--- src/daemon/driver/driverInteractive.ml 28 Aug 2006 18:20:18 -0000
1.94
+++ src/daemon/driver/driverInteractive.ml 28 Aug 2006 19:02:17 -0000
1.95
@@ -2193,3 +2193,180 @@
("", "srh", result); ]
else
Printf.bprintf buf "%s" result
+
+let filenames_variability o list =
+ let debug = false in
+
+ (* minimum distance that must exist between two groups of filenames
+ so they're considered separate *)
+ let gap_threshold = 4 in
+
+ let buf = o.conn_buf in
+
+ let is_alphanum = function
+ | 'A' .. 'Z'
+ | 'a' .. 'z'
+ | '0' .. '9' -> true
+ | _ -> false in
+
+ let canonized_words s =
+ let len = String.length s in
+ let current_word = Buffer.create len in
+ let rec aux i wl =
+ if i < len then
+ if not (is_alphanum s.[i]) then aux (i + 1) wl
+ else begin
+ Buffer.add_char current_word (Char.lowercase s.[i]);
+ aux2 (i + 1) wl
+ end
+ else wl
+ and aux2 i wl =
+ if i < len then
+ if not (is_alphanum s.[i]) then begin
+ let wl = Buffer.contents current_word :: wl in
+ Buffer.reset current_word;
+ aux i wl
+ end else begin
+ Buffer.add_char current_word (Char.lowercase s.[i]);
+ aux2 (i + 1) wl
+ end
+ else Buffer.contents current_word :: wl
+ in
+ aux 0 [] in
+
+ let costs = {
+ Levenshtein.insert_cost = 1;
+ Levenshtein.delete_cost = 1;
+ Levenshtein.replace_cost = 2 } in
+ let dist = Levenshtein.ForWords.distance costs in
+
+ (* fold over all the pairs that can be made with the elements of l *)
+ let list_pair_fold f acc l =
+ let rec aux acc e1 l =
+ let rec aux2 acc e1 l =
+ match l with
+ | [] -> acc
+ | h :: q -> aux2 (f acc e1 h) e1 q in
+ match l with
+ | [] -> acc
+ | h :: q -> aux (aux2 acc e1 l) h q in
+ match l with
+ | [] -> acc
+ | h :: q -> aux acc h q in
+
+ let score_list =
+ List.map (fun fileinfo ->
+ let filenameset = List.map fst fileinfo.file_names in
+ (* canonize filenames by keeping only lowercase words, and
+ sorting them so that initial order doesn't matter;
+ Remove duplicate canonized filenames *)
+ let fns = Array.of_list (List.fold_left (fun acc fn ->
+ let new_fn =
+ Array.of_list (List.sort String.compare (canonized_words fn)) in
+ if List.mem new_fn acc then acc else new_fn :: acc
+ ) [] filenameset) in
+ (* precalculate all Levenshtein distances
+ That's currently the most expensive phase when lots of
+ different filenames exist
+ *)
+ let n = Array.length fns in
+ let matrix = Array.make_matrix n n 0 in
+ (* we can only assume the matrix is symetric if insert and
+ delete costs are the same *)
+ assert (costs.Levenshtein.insert_cost = costs.Levenshtein.delete_cost);
+ for i = 0 to n - 2 do
+ let d1 = dist fns.(i) in
+ for j = i + 1 to n - 1 do
+ matrix.(i).(j) <- d1 fns.(j)
+ done
+ done;
+
+ (* for debugging only *)
+ let rec string_of_set (s, d) =
+ Printf.sprintf "[%s] (%d)"
+ (String.concat "/"
+ (List.map (fun i ->
+ String.concat " " (Array.to_list fns.(i))
+ ) s)) d in
+
+ (* there's one more cluster than gaps between clusters *)
+ let number_of_clusters (s, d) = d + 1 in
+
+ let pair_dist i1 i2 =
+ (* again we assume the matrix is symetric,
+ and that i1 <> i2 *)
+ if i1 < i2 then matrix.(i1).(i2) else matrix.(i2).(i1) in
+
+ (* usual definition of distance between two sets
+ d(E1,E2) = min { e1 in E1, e2 in E2 / d(e1,e2) }
+ *)
+ let rec sets_dist s1 s2 =
+ match s1, s2 with
+ | ([i1], _), ([i2], _) -> pair_dist i1 i2
+ | ([i1], _), (h2 :: q2, d2) ->
+ min (pair_dist i1 h2) (sets_dist s1 (q2, d2))
+ | (h1 :: q1, d1), s2 ->
+ min (sets_dist ([h1], d1) s2) (sets_dist (q1, d1) s2)
+ | _ -> assert false in
+
+ (* initially, each filename is in its own set *)
+ let initial_list_of_sets =
+ let rec aux n l =
+ let n1 = n - 1 in
+ if n1 >= 0 then
+ aux n1 (([n1], 0) :: l)
+ else l in
+ aux n [] in
+
+ let gap d =
+ if d < gap_threshold then 0 else 1 in
+
+ let rec coalesce_sets ls =
+ match ls with
+ | [s] -> s
+ | _ ->
+ (* find two sets with minimal distance and coalesce them *)
+ match (
+ list_pair_fold (fun acc e1 e2 ->
+ let d = sets_dist e1 e2 in
+ match acc with
+ | None -> Some (e1, e2, d)
+ | Some (bs1, bs2, min_dist) ->
+ if d < min_dist then Some (e1, e2, d)
+ else acc) None ls) with
+ | Some (((s1, d1) as e1), ((s2, d2) as e2), min_dist) ->
+ if debug then
+ Printf.printf "Coalesce\n%s and\n%s (distance %d)\n"
+ (string_of_set e1) (string_of_set e2) min_dist;
+ coalesce_sets
+ ((s1 @ s2, d1 + d2 + (gap min_dist)) ::
+ (List.filter (fun e -> e != e1 && e != e2) ls))
+ | None -> assert false
+ in
+ let coalesced_set = coalesce_sets initial_list_of_sets in
+ let nclusters = number_of_clusters coalesced_set in
+
+ fileinfo, nclusters
+ ) list in
+
+ (* files with most clusters at the end of results table *)
+ let sorted_score_list =
+ List.sort (fun (_, nc1) (_, nc2) -> compare nc1 nc2)
+ score_list in
+
+ let print_table = if o.conn_output = HTML then print_table_html 2
+ else print_table_text in
+ print_table buf
+ [|
+ Align_Left; Align_Left; Align_Right |]
+ [|
+ "Num";
+ "File";
+ "Clusters" |]
+ (List.map (fun (fileinfo, nc) ->
+ let n = network_find_by_num fileinfo.file_network in
+ [|
+ Printf.sprintf "[%-s %5d]" n.network_name (fileinfo.file_num);
+ shorten fileinfo.file_name 80;
+ string_of_int nc |]
+ ) sorted_score_list)