...
class LabDataLine {
static col_names = ['LABDT','LABTM','REQUISITIONNO','SUBJECT','SUBJECTINITIALS','SEX','SUBJECTDOB','SPONSOR','PROTOCOL','PHYSICIANNM','SITENO','VISIT','VISITNO','GROUPPNL','TESTNM','RANGEN','RESULTTXT','RESULTREAL','RANGELOW','RANGEHI','FLAG','UNIT','FOOTNOTE','LOCCODE','LOCADDR']
static pks = ['PHYSICIANNM','SITENO','SUBJECT','VISIT','VISITNO','LABDT','LABTM','GROUPPNL','TESTNM'], cols = [:]
static{col_names.eachWithIndex{o,i->
cols[o]=i
}}
static REGEXP = ['SUBJECT':/00\d{1}\-\d{4}/,'VISITNO':/^NA$|^[1-9]{1}\d*/,'LABDT':/(([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})(((0[13578]|1[02])(0[1-9]|[12][0-9]|3[01]))|((0[469]|11)(0[1-9]|[12][0-9]|30))|(02(0[1-9]|[1][0-9]|2[0-8]))))|((([0-9]{2})(0[48]|[2468][048]|[13579][26])|((0[48]|[2468][048]|[3579][26])00))0229)/,'LABTM':/^$|((0[0-9])|(1[0-9])|(2[0-3]))([0-5][0-9])([0-5][0-9])/,'RESULTREAL':/^$|^[-+]?\d+(\.\d+)?$/,'RANGELOW':/^$|^[-+]?\d+(\.\d+)?$/,'RANGEHI':/^$|^[-+]?\d+(\.\d+)?$/]
def location, line, arr
public LabDataLine(m){
location = m.location
line = m.line
arr = this.line.split('\\|')
}
public static String genHd() {
def s = []
col_names.each{s << it}
"<th>${s.join('</th><th>')}</th>"
}
public String toString(){
def s = []
col_names.each{
def data = arr[cols[it]]
s << (data==''?' ':data)
}
"<td>${s.join('</td><td>')}</td>"
}
public String toString(highlightedProps){
def s = []
col_names.each{
def data = arr[cols[it]]
def hl = highlightedProps.contains(it)
s << (data==''?' ':"${hl?'<b>':''}$data${hl?'</b>':''}")
}
"<td>${s.join('</td><td>')}</td>"
}
public int hashCode(){
def s = []
pks.each{s << arr[cols[it]]}
s.join('|').hashCode()
}
public boolean equals(Object o){
this.hashCode() == o.hashCode()
}
}
def startAt = new Date().time
def files = new File('../data').listFiles().findAll{it.name=~/(?i)\.txt$/}
files.sort{f1,f2->
f1.lastModified()<=>f2.lastModified()
}
def dataFiles = [files[-1], files[-2]].sort{f1,f2->f1.name<=>f2.name}
def dataset = [:], h2dl = [[:],[:],[:]]
// preparation
(-2..-1).each{i->
dataFiles[i].eachLine{l,n->
if(n>1){
def location = "$n@file${i+3}"
def dl = new LabDataLine(location:location,line:l)
dataset[location] = dl
def hc = l.hashCode()
h2dl[i+2][hc] = h2dl[i+2][hc]?:[]
h2dl[i+2][hc] << location
def hc2 = dl.hashCode()
h2dl[-1][hc2] = h2dl[-1][hc2]?:[:]
h2dl[-1][hc2][hc] = h2dl[-1][hc2][hc] ?: []
h2dl[-1][hc2][hc] << dl
}
}
}
def report = []
report << "<html><head><title>Lab Data Validation Report</title><style type=\"text/css\">table, tr, td {border-width: 1px;border-spacing: 2px;border-style: outset;border-color: blue;border-collapse: separate;}th{background-color:red;}.even-row{background-color:#CECEF6;}.odd-row{background-color:#A9D0F5;}</style></head><body>file1: ${dataFiles[0].name}<br>file2: ${dataFiles[1].name}<hr>"
//#1. unit validation
report << "---unit validation---<br>"
report << "<table><tr><th>Location</th><th>Field</th><th>Value</th></tr>"
def j=0
dataset.each{dlkv->
def dl = dlkv.value
dl.pks.each{pk->
if(dl.arr[dl.cols[pk]].trim() == ''){
report << "<tr class=\"${j++%2==0?'even-row':'odd-row'}\"><td>${dl.location}</td><td>$pk</td><td>NULL</td></tr>"
}
}
dl.REGEXP.each{kv->
if(!(dl.arr[dl.cols[kv.key]] =~ kv.value)){
report << "<tr class=\"${j++%2==0?'even-row':'odd-row'}\"><td>${dl.location}</td><td>${kv.key}</td><td>${dl.arr[dl.cols[kv.key]]}</td><tr>"
}
}
}
report << "</table><hr>"
//#2. integration validation :: duplicated lines in each file
report << "---duplicated lines in each file---<br>"
report << "<table><tr><th>Location</th>${LabDataLine.genHd()}</tr>"
(0..1).each{i->
h2dl[i].each{kv->
if(kv.value.size()>1){
def tmp = []
kv.value.each{e->
tmp << "<tr><td>$e</td>${dataset[e]}</tr>"
}
report << tmp
}
}
}
report << "</table><hr>"
//#3. integration validation :: unequal lines with equal PKs in the two files
report << "---unequal lines with equal PKs in the two files---<br>"
report << "<table><tr><th>Location</th>${LabDataLine.genHd()}</tr>"
def seq1 = 1, colors = ['Aqua', 'Orange'], ci = 0
h2dl[-1].each{
if(it.value.size()>1){
def seq2 = 1
it.value.each{dl->
def d = dl.value[0]
def pkgrp = it.value
def highlightedProps = []
d.col_names.each{col->
if(pkgrp.findAll{kv->kv.value[0].arr[d.cols[col]]==d.arr[d.cols[col]]}.size()==1){
highlightedProps << col
}
}
report << "<tr style=\"background:${colors[ci]};\"><td>${seq1}.${seq2}<br>${dl.value.location.join('<br>')}</td>${dl.value[0].toString(highlightedProps)}</tr>"
seq2++
}
ci = seq1++%2
}
}
report << "</table><hr>"
//#4. integration validation :: lines in file1 but not in file2(by PKs)
report << "---lines in file1 but not in file2(by PKs)---<br>"
report << "<table><tr><th>Location</th>${LabDataLine.genHd()}</tr>"
def hss = [new HashSet(), new HashSet()]
dataset.each{
hss[it.value.location=~/@file1/?0:1] << it.value
}
hss[0].minus(hss[1]).eachWithIndex{o,i->
report << "<tr class=\"${i%2==0?'even-row':'odd-row'}\"><td>${o.location}</td>${o}</tr>"
}
report << "</table><hr>"
def msg = "Validation script by S.C. Global Research Services, LLC.<br>Time elapsed: ${new Date().time - startAt} ms"
report << "<p>$msg</p>"
new File("../output/report.htm").write(report.join('\n'))
...
1万+

被折叠的 条评论
为什么被折叠?



