Skip to content

Commit

Permalink
wshiml
Browse files Browse the repository at this point in the history
  • Loading branch information
unhammer committed May 18, 2015
0 parents commit 7b7be6f
Show file tree
Hide file tree
Showing 15 changed files with 1,145 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
_build/
setup.data
setup.log
*.byte
*.native
*/*.mldylib
*/*.mllib
*.docdir
4 changes: 4 additions & 0 deletions .merlin
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
S src
B _build/**

PKG cmdliner
16 changes: 16 additions & 0 deletions .ocamlinit
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
(* -*- mode: tuareg -*- *)

(* Added by OPAM. *)
let () =
try Topdirs.dir_directory (Sys.getenv "OCAML_TOPLEVEL_PATH")
with Not_found -> ()
;;

#directory "_build/src"
#use "topfind"
#require "bytes"
#load "wshiml.cma"

#require "ppx_deriving.std"
let pp_int_couple = Ymse.Couple.pp Format.pp_print_int;;
#install_printer pp_int_couple;;
Empty file added CHANGES.md
Empty file.
515 changes: 515 additions & 0 deletions COPYING

Large diffs are not rendered by default.

Empty file added DEVEL.md
Empty file.
41 changes: 41 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# OASIS_START
# DO NOT EDIT (digest: a3c674b4239234cbbe53afe090018954)

SETUP = ocaml setup.ml

build: setup.data
$(SETUP) -build $(BUILDFLAGS)

doc: setup.data build
$(SETUP) -doc $(DOCFLAGS)

test: setup.data build
$(SETUP) -test $(TESTFLAGS)

all:
$(SETUP) -all $(ALLFLAGS)

install: setup.data
$(SETUP) -install $(INSTALLFLAGS)

uninstall: setup.data
$(SETUP) -uninstall $(UNINSTALLFLAGS)

reinstall: setup.data
$(SETUP) -reinstall $(REINSTALLFLAGS)

clean:
$(SETUP) -clean $(CLEANFLAGS)

distclean:
$(SETUP) -distclean $(DISTCLEANFLAGS)

setup.data:
$(SETUP) -configure $(CONFIGUREFLAGS)

configure:
$(SETUP) -configure $(CONFIGUREFLAGS)

.PHONY: build doc test all install uninstall reinstall clean distclean configure

# OASIS_STOP
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Wshiml
========

Implementation of
http://nlp.stanford.edu/IR-book/html/htmledition/near-duplicates-and-shingling-1.html

Build requires [oasis](http://oasis.forge.ocamlcore.org/index.html). Do:

./configure # optionally with --prefix
make
make install

To build the example command-line program, do

./configure --enable-cli
make
make install
find-similar-docs --help

The command-line program requires
[cmdliner](http://erratique.ch/software/cmdliner). The rest of the
software has no dependencies apart from Oasis for building from git.

So far the code is completely unoptimised apart from what's described
in
http://nlp.stanford.edu/IR-book/html/htmledition/near-duplicates-and-shingling-1.html
and uses 12s (9s with super-shingling) to cluster 1458 documents of
altogether 556176 words on an old 2.8 GHz AMD.
40 changes: 40 additions & 0 deletions _oasis
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
OASISFormat: 0.4
Name: wshiml
Version: 0.1.0
Synopsis: Near-duplicate document detection by word shingling
Authors: Kevin Brubeck Unhammer
License: LGPL-2.1 with OCaml linking exception
BuildTools: ocamlbuild
Plugins: META (0.4), DevFiles (0.4)

Description: Near-duplicate document detection by word shingling; library and example command line program.
Homepage: https://github.com/unhammer/wshiml

Library "wshiml"
Path: src
Modules: Wshiml
InternalModules: Ymse

Flag "cli"
Description: Install example command line program to find similar docs
Default: false

Executable "find-similar-docs"
Build$: flag(cli)
Install$: flag(cli)
Path: examples
MainIs: find_similar_docs.ml
BuildDepends: wshiml, cmdliner
CompiledObject: best


AlphaFeatures: ocamlbuild_more_args

Document "wshiml_api"
Type: ocamlbuild (0.4)
BuildTools: ocamldoc
Title: API reference for Wshiml
XOCamlbuildPath: .
XOCamlbuildExtraArgs:
"-docflags '-colorize-code -short-functors -charset utf-8'"
XOCamlbuildLibraries: wshiml
27 changes: 27 additions & 0 deletions configure
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/sh

# OASIS_START
# DO NOT EDIT (digest: dc86c2ad450f91ca10c931b6045d0499)
set -e

FST=true
for i in "$@"; do
if $FST; then
set --
FST=false
fi

case $i in
--*=*)
ARG=${i%%=*}
VAL=${i##*=}
set -- "$@" "$ARG" "$VAL"
;;
*)
set -- "$@" "$i"
;;
esac
done

ocaml setup.ml -configure "$@"
# OASIS_STOP
67 changes: 67 additions & 0 deletions examples/find_similar_docs.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
let slurp_file filename =
let lines = ref [] in
let chan = open_in filename in
let rl = try
while true; do
lines := input_line chan :: !lines
done; !lines
with End_of_file ->
close_in chan;
!lines
in
List.rev rl |> Bytes.concat "\n"

let print_clusters (files: bytes array) (clusters: Wshiml.clusters) =
List.filter (function a::b::_ -> true | _ -> false) clusters
|> List.map (fun l -> List.map (Array.get files) l |> Bytes.concat "\t")
|> Bytes.concat "\n"
|> print_endline

let print_scores =
List.iter (fun ((d1,d2),score) -> Printf.printf "%d & %d = %d\n" d1 d2 score)


let find_similar_docs threshold super files =
let files_a = Array.of_list files in
let ndocs = Array.length files_a in
Wshiml.sketch_docs ~slurp_file files
|> (fun sketch -> if super then Wshiml.supersketches sketch else sketch)
|> Wshiml.score_sketches ~threshold
(* DEBUG: *)
(* |> (fun scores -> print_scores scores; print_endline ""; scores) *)
|> Wshiml.cluster_scores ~ndocs
|> print_clusters files_a


open Cmdliner

let super =
let doc = "Create a super-shingle (shingle of shingles) before clustering. May speed up clustering." in
Arg.(value & flag & info ["s"; "super"] ~docv:"SUPER" ~doc)

let threshold =
let doc = "The overlap threshold for considering two documents similar." in
Arg.(value & opt float 0.8 & info ["t"; "threshold"] ~docv:"THRESHOLD" ~doc)

let files =
let doc = "The documents to compare." in
Arg.(non_empty & pos_all file [] & info [] ~docv:"FILES" ~doc)

let fnd_t = Term.(pure find_similar_docs $ threshold $ super $ files)

let info =
let doc = "find similar or near duplicate documents" in
let man = [
`S "DESCRIPTION";
`P "$(tname) uses word shingling to find near duplicate or similar
documents among files specified on the command line.";
`P "The output is tab-separated, one cluster per line.";
`P "Note that the comparison is probabilistic, and results may
differ between runs.";
`S "BUGS";
`P "Email bug reports to <unhammer at fsfe.org>.";
] in
Term.info "find-similar-docs" ~version:"0.1.0" ~doc ~man

let () =
match Term.eval (fnd_t, info) with `Error _ -> exit 1 | _ -> exit 0
37 changes: 37 additions & 0 deletions setup.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
(* setup.ml generated for the first time by OASIS v0.4.5 *)

(* OASIS_START *)
(* DO NOT EDIT (digest: 9852805d5c19ca1cb6abefde2dcea323) *)
(******************************************************************************)
(* OASIS: architecture for building OCaml libraries and applications *)
(* *)
(* Copyright (C) 2011-2013, Sylvain Le Gall *)
(* Copyright (C) 2008-2011, OCamlCore SARL *)
(* *)
(* This library is free software; you can redistribute it and/or modify it *)
(* under the terms of the GNU Lesser General Public License as published by *)
(* the Free Software Foundation; either version 2.1 of the License, or (at *)
(* your option) any later version, with the OCaml static compilation *)
(* exception. *)
(* *)
(* This library is distributed in the hope that it will be useful, but *)
(* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *)
(* or FITNESS FOR A PARTICULAR PURPOSE. See the file COPYING for more *)
(* details. *)
(* *)
(* You should have received a copy of the GNU Lesser General Public License *)
(* along with this library; if not, write to the Free Software Foundation, *)
(* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *)
(******************************************************************************)

let () =
try
Topdirs.dir_directory (Sys.getenv "OCAML_TOPLEVEL_PATH")
with Not_found -> ()
;;
#use "topfind";;
#require "oasis.dynrun";;
open OASISDynRun;;

(* OASIS_STOP *)
let () = setup ();;
Loading

0 comments on commit 7b7be6f

Please sign in to comment.