mldonkey-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr...


From: mldonkey-commits
Subject: [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr...
Date: Mon, 28 Aug 2006 19:02:17 +0000

CVSROOT:        /sources/mldonkey
Module name:    mldonkey
Changes by:     spiralvoice <spiralvoice>       06/08/28 19:02:17

Modified files:
        distrib        : ChangeLog 
        src/daemon/driver: driverCommands.ml driverInteractive.ml 

Log message:
        patch #5341

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/mldonkey/distrib/ChangeLog?cvsroot=mldonkey&r1=1.979&r2=1.980
http://cvs.savannah.gnu.org/viewcvs/mldonkey/src/daemon/driver/driverCommands.ml?cvsroot=mldonkey&r1=1.167&r2=1.168
http://cvs.savannah.gnu.org/viewcvs/mldonkey/src/daemon/driver/driverInteractive.ml?cvsroot=mldonkey&r1=1.94&r2=1.95

Patches:
Index: distrib/ChangeLog
===================================================================
RCS file: /sources/mldonkey/mldonkey/distrib/ChangeLog,v
retrieving revision 1.979
retrieving revision 1.980
diff -u -b -r1.979 -r1.980
--- distrib/ChangeLog   28 Aug 2006 18:21:09 -0000      1.979
+++ distrib/ChangeLog   28 Aug 2006 19:02:17 -0000      1.980
@@ -15,6 +15,9 @@
 =========
 
 2006/08/28
+5341: New command: filenames_variability
+- compute differences between the filenames of a file to detect fakes
+  the higher the value, the more likely its a fake
 5335: Fix small bugs in "!" command (execute shell commands)
 5334: HTML: Display edk2 links in search results
 5333: Clean logging (Swarmer, Fasttrack, Gnutella)

Index: src/daemon/driver/driverCommands.ml
===================================================================
RCS file: /sources/mldonkey/mldonkey/src/daemon/driver/driverCommands.ml,v
retrieving revision 1.167
retrieving revision 1.168
diff -u -b -r1.167 -r1.168
--- src/daemon/driver/driverCommands.ml 28 Aug 2006 18:21:10 -0000      1.167
+++ src/daemon/driver/driverCommands.ml 28 Aug 2006 19:02:17 -0000      1.168
@@ -2717,6 +2717,12 @@
         with e -> Printf.sprintf (_b "No file number %d, error %s") num 
(Printexc2.to_string e)
     ), "<num> \"<new name>\" :\t\tchange name of download <num> to <new name>";
 
+    "filenames_variability", Arg_none (fun o ->
+      let list = List2.tail_map file_info !!files in
+      DriverInteractive.filenames_variability o list;
+      _s "done"
+    ), ":\t\t\t\tTell what files have several very different names";
+
     "dllink", Arg_multiple (fun args o ->
         let url = String2.unsplit args ' ' in
         dllink_parse (o.conn_output = HTML) url

Index: src/daemon/driver/driverInteractive.ml
===================================================================
RCS file: /sources/mldonkey/mldonkey/src/daemon/driver/driverInteractive.ml,v
retrieving revision 1.94
retrieving revision 1.95
diff -u -b -r1.94 -r1.95
--- src/daemon/driver/driverInteractive.ml      28 Aug 2006 18:20:18 -0000      
1.94
+++ src/daemon/driver/driverInteractive.ml      28 Aug 2006 19:02:17 -0000      
1.95
@@ -2193,3 +2193,180 @@
       ("", "srh", result); ]
   else
     Printf.bprintf buf "%s" result
+
+let filenames_variability o list =
+  let debug = false in
+
+  (* minimum distance that must exist between two groups of filenames
+     so they're considered separate *)
+  let gap_threshold = 4 in
+
+  let buf = o.conn_buf in
+
+  let is_alphanum = function
+    | 'A' .. 'Z'
+    | 'a' .. 'z'
+    | '0' .. '9' -> true
+    | _ -> false in
+
+  let canonized_words s =
+    let len = String.length s in
+    let current_word = Buffer.create len in
+    let rec aux i wl =
+      if i < len then
+       if not (is_alphanum s.[i]) then aux (i + 1) wl
+       else begin
+         Buffer.add_char current_word (Char.lowercase s.[i]);
+         aux2 (i + 1) wl
+       end
+      else wl
+    and aux2 i wl =
+      if i < len then 
+       if not (is_alphanum s.[i]) then begin
+         let wl = Buffer.contents current_word :: wl in
+         Buffer.reset current_word;
+         aux i wl
+       end else begin
+         Buffer.add_char current_word (Char.lowercase s.[i]);
+         aux2 (i + 1) wl
+       end
+      else Buffer.contents current_word :: wl
+    in
+    aux 0 [] in
+
+  let costs = { 
+    Levenshtein.insert_cost = 1; 
+    Levenshtein.delete_cost = 1; 
+    Levenshtein.replace_cost = 2 } in
+  let dist = Levenshtein.ForWords.distance costs in
+
+  (* fold over all the pairs that can be made with the elements of l *)
+  let list_pair_fold f acc l =
+    let rec aux acc e1 l =
+      let rec aux2 acc e1 l =
+       match l with
+       | [] -> acc
+       | h :: q -> aux2 (f acc e1 h) e1 q in
+      match l with
+      | [] -> acc
+      | h :: q -> aux (aux2 acc e1 l) h q in
+    match l with
+    | [] -> acc
+    | h :: q -> aux acc h q in
+
+  let score_list =
+    List.map (fun fileinfo ->
+      let filenameset = List.map fst fileinfo.file_names in
+      (* canonize filenames by keeping only lowercase words, and
+        sorting them so that initial order doesn't matter;
+        Remove duplicate canonized filenames *)
+      let fns = Array.of_list (List.fold_left (fun acc fn ->
+       let new_fn =
+         Array.of_list (List.sort String.compare (canonized_words fn)) in
+       if List.mem new_fn acc then acc else new_fn :: acc
+      ) [] filenameset) in
+      (* precalculate all Levenshtein distances 
+        That's currently the most expensive phase when lots of
+        different filenames exist 
+      *)
+      let n = Array.length fns in
+      let matrix = Array.make_matrix n n 0 in
+      (* we can only assume the matrix is symetric if insert and
+        delete costs are the same *)
+      assert (costs.Levenshtein.insert_cost = costs.Levenshtein.delete_cost);
+      for i = 0 to n - 2 do
+       let d1 = dist fns.(i) in
+       for j = i + 1 to n - 1 do
+         matrix.(i).(j) <- d1 fns.(j)
+       done
+      done;
+
+      (* for debugging only *)
+      let rec string_of_set (s, d) = 
+       Printf.sprintf "[%s] (%d)" 
+         (String.concat "/" 
+           (List.map (fun i -> 
+             String.concat " " (Array.to_list fns.(i))
+           ) s)) d in
+      
+      (* there's one more cluster than gaps between clusters *)
+      let number_of_clusters (s, d) = d + 1 in
+      
+      let pair_dist i1 i2 =
+       (* again we assume the matrix is symetric,
+          and that i1 <> i2 *)
+       if i1 < i2 then matrix.(i1).(i2) else matrix.(i2).(i1) in
+
+      (* usual definition of distance between two sets 
+        d(E1,E2) = min { e1 in E1, e2 in E2 / d(e1,e2) }
+      *)
+      let rec sets_dist s1 s2 =
+       match s1, s2 with
+       | ([i1], _), ([i2], _) -> pair_dist i1 i2
+       | ([i1], _), (h2 :: q2, d2) ->
+           min (pair_dist i1 h2) (sets_dist s1 (q2, d2)) 
+       | (h1 :: q1, d1), s2 -> 
+           min (sets_dist ([h1], d1) s2) (sets_dist (q1, d1) s2) 
+       | _ -> assert false in
+      
+      (* initially, each filename is in its own set *)
+      let initial_list_of_sets = 
+       let rec aux n l =
+         let n1 = n - 1 in
+         if n1 >= 0 then 
+           aux n1 (([n1], 0) :: l) 
+         else l in
+       aux n [] in
+
+      let gap d =
+       if d < gap_threshold then 0 else 1 in
+      
+      let rec coalesce_sets ls =
+       match ls with
+       | [s] -> s
+       | _ ->
+           (* find two sets with minimal distance and coalesce them *)
+           match (
+             list_pair_fold (fun acc e1 e2 ->
+               let d = sets_dist e1 e2 in
+               match acc with
+               | None -> Some (e1, e2, d)
+               | Some (bs1, bs2, min_dist) ->
+                   if d < min_dist then Some (e1, e2, d)
+                   else acc) None ls) with
+           | Some (((s1, d1) as e1), ((s2, d2) as e2), min_dist) ->
+               if debug then
+                 Printf.printf "Coalesce\n%s and\n%s (distance %d)\n"
+                   (string_of_set e1) (string_of_set e2) min_dist;
+               coalesce_sets 
+                 ((s1 @ s2, d1 + d2 + (gap min_dist)) ::
+                   (List.filter (fun e -> e != e1 && e != e2) ls))
+           | None -> assert false
+      in
+      let coalesced_set = coalesce_sets initial_list_of_sets in
+      let nclusters = number_of_clusters coalesced_set in
+      
+      fileinfo, nclusters
+    ) list in
+
+  (* files with most clusters at the end of results table *)
+  let sorted_score_list =
+    List.sort (fun (_, nc1) (_, nc2) -> compare nc1 nc2)
+      score_list in
+
+  let print_table = if o.conn_output = HTML then print_table_html 2
+    else print_table_text in
+  print_table buf
+    [|
+      Align_Left; Align_Left; Align_Right |]
+    [| 
+      "Num";
+      "File";
+      "Clusters" |]
+    (List.map (fun (fileinfo, nc) ->
+      let n = network_find_by_num fileinfo.file_network in
+      [| 
+       Printf.sprintf "[%-s %5d]" n.network_name (fileinfo.file_num);
+       shorten fileinfo.file_name 80;
+       string_of_int nc |]
+    ) sorted_score_list)




reply via email to

[Prev in Thread] Current Thread [Next in Thread]