-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoreuniq.between.two.file.cr
129 lines (116 loc) · 4.59 KB
/
coreuniq.between.two.file.cr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
require "admiral"
# Compares the ID column of a query file with the ID column of a target file.
#
# Writes, all named after `--prefix`:
# * `<prefix>.coreid.list`                  - IDs present in both files
# * `<prefix>.q.column<N>.uniqid.list`      - IDs present only in the query file
# * `<prefix>.t.column<N>.uniqid.list`      - IDs present only in the target file
# * `<prefix>.diff.stat.txt`                - tab-separated counts and ratios
class GrepFile < Admiral::Command
  define_argument target,
    description: "target file",
    required: true
  define_argument query,
    description: "query file",
    required: true
  define_flag column_target : Int32,
    default: 1_i32,
    description: ""
  define_flag column_query : Int32,
    default: 1_i32,
    description: ""
  define_flag ignore_line_mathed_by : String,
    default: "^[#@]",
    description: "if id start with # or @, will remove # or @, support regex syntax"
  define_flag delete_chars_from_column : String,
    default: "^>",
    description: "delete id first chars, support regex syntax"
  define_flag sep_query : String,
    default: "\t",
    description: "query separator, '\\t' or '\\s'"
  define_flag sep_target : String,
    default: "\t",
    description: "target separator, '\\t' or '\\s'"
  define_flag prefix : String,
    default: "myth",
    description: "prefix of output"
  define_help description: "A replace for grep -f(which cost too many memory)"
  define_version "1.0.2"

  # NOTE(review): not referenced by any live code in this class; despite the
  # name, `Time.local` looks like it is evaluated when the program runs rather
  # than at compile time - confirm before relying on it.
  COMPILE_TIME = Time.local

  # Reads both input files, writes the three ID lists and the statistics file,
  # and echoes progress plus the statistics table to STDOUT.
  #
  # Raises if a column flag is smaller than 1, if a line has fewer columns
  # than the requested column, or if the internal count cross-check fails.
  def run
    if ARGV.size == 0
      # No arguments at all: show usage. Presumably Admiral's help flag exits
      # the process after printing - confirm; otherwise execution continues.
      GrepFile.run "--help"
    end

    ignore_pattern = flags.ignore_line_mathed_by
    strip_pattern = flags.delete_chars_from_column
    puts "--ignore-line-mathed-by #{ignore_pattern}"
    puts "--delete-chars-from-column #{strip_pattern}" if strip_pattern != ""

    # Compile the user-supplied patterns once instead of once per input line.
    # An empty pattern disables the corresponding step (nil).
    ignore_re = ignore_pattern.empty? ? nil : Regex.new(ignore_pattern)
    strip_re = strip_pattern.empty? ? nil : Regex.new(strip_pattern)

    query_name = Path[arguments.query].basename
    target_name = Path[arguments.target].basename

    puts "start read query"
    query_ids = read_ids(arguments.query, flags.column_query, Regex.new(flags.sep_query),
      ignore_re, strip_re, "--column-query", "--sep-query")
    query_ids_num = query_ids.size

    puts "start read target"
    target_ids = read_ids(arguments.target, flags.column_target, Regex.new(flags.sep_target),
      ignore_re, strip_re, "--column-target", "--sep-target")
    target_ids_num = target_ids.size

    puts "start get query uniq"
    core_number = 0
    query_ids_uniq_num = 0
    # Block form of File.open closes both files even if a write raises.
    File.open("#{flags.prefix}.coreid.list", "w") do |core_out|
      File.open("#{flags.prefix}.q.column#{flags.column_query}.uniqid.list", "w") do |qout|
        query_ids.each do |id|
          if target_ids.includes?(id)
            core_out.puts(id)
            core_number += 1
          else
            qout.puts(id)
            query_ids_uniq_num += 1
          end
        end
      end
    end

    puts "start get target uniq"
    target_ids_uniq_num = 0
    File.open("#{flags.prefix}.t.column#{flags.column_target}.uniqid.list", "w") do |tout|
      target_ids.each do |id|
        next if query_ids.includes?(id)
        tout.puts(id)
        target_ids_uniq_num += 1
      end
    end

    # Cross-check: shared + unique must add up to the distinct total per file.
    raise "error: core_number+target_ids_uniq_num != target_ids_num: #{core_number}+#{target_ids_uniq_num} != #{target_ids_num}\n" if core_number + target_ids_uniq_num != target_ids_num
    raise "error: core_number+query_ids_uniq_num != query_ids_num: #{core_number}+#{query_ids_uniq_num} != #{query_ids_num}\n" if core_number + query_ids_uniq_num != query_ids_num

    # The "percent" columns hold plain ratios (count / total), not values
    # scaled to 100. With a total of zero the float division yields NaN.
    out_stat = String.build do |io|
      io << "db\tcore_number\tuniq_number\tcore_number_percent\tuniq_number_percent\ttotal_number\n"
      io << "#{target_name}\t#{core_number}\t#{target_ids_uniq_num}\t#{core_number.to_f/target_ids_num}\t#{target_ids_uniq_num.to_f/target_ids_num}\t#{target_ids_num}\n"
      io << "#{query_name}\t#{core_number}\t#{query_ids_uniq_num}\t#{core_number.to_f/query_ids_num}\t#{query_ids_uniq_num.to_f/query_ids_num}\t#{query_ids_num}\n"
    end
    puts "#{out_stat}\n"
    File.write("#{flags.prefix}.diff.stat.txt", out_stat)
  end

  # Collects the distinct IDs found in one column of a delimited text file.
  #
  # * `path`        - file to read, line by line
  # * `column`      - 1-based index of the ID column
  # * `separator`   - regex the line is split on
  # * `ignore`      - lines matching this regex are skipped (nil disables)
  # * `strip`       - matches of this regex are removed from each ID (nil disables)
  # * `column_flag` / `sep_flag` - flag names quoted in error messages
  #
  # Returns the IDs as a Set, in first-seen order. Whitespace-only lines are
  # skipped. Raises if `column` is below 1 or exceeds a line's column count.
  private def read_ids(path, column : Int32, separator : Regex, ignore : Regex?,
                       strip : Regex?, column_flag : String, sep_flag : String) : Set(String)
    # A column of 0 or less would index from the end of the row and quietly
    # pick the wrong field, so reject it up front.
    raise "error: #{column_flag} must be >= 1, but got #{column}" if column < 1

    ids = Set(String).new
    File.each_line(path) do |line|
      next if ignore && line.match(ignore)
      next if line.match(/^\s*$/)
      fields = line.split(separator)
      if column > fields.size
        raise "error: #{path} only have #{fields.size} column, but #{column_flag} #{column}, try to change #{sep_flag} "
      end
      id = fields[column - 1]
      id = id.gsub(strip, "") if strip
      ids << id
    end
    ids
  end
end
# Script entry point: invoke the command when this file is executed.
# No arguments are passed here; presumably Admiral falls back to the
# process's ARGV in that case - confirm against the Admiral documentation.
GrepFile.run