|
@@ -0,0 +1,226 @@
|
|
|
|
|
# Builds the regex fragments used to recognise measurement units.
#
# Each pattern_* variable is a parenthesised alternation stored as a string so
# that it can be spliced into dynamic regexes (see parse_amount_unit).
#
# NOTE(review): this rule has no pattern, so awk evaluates it for every input
# record. If it was meant to run only once it should be a BEGIN rule -- confirm
# against the driver script before changing it.
{
    # Weight units.
    pattern_weight = "(oz|lb|lbs|g|kg)"

    # Volume units. The dots are escaped ("\\." in the string becomes \. in
    # the regex) so that "fl. oz" matches a literal period instead of any
    # character.
    pattern_volume = "(ml|l|L|ML|fl oz|fl\\. oz|fl\\.oz)"

    # Countable units.
    pattern_count = "(each|piece|pieces|bag|bags|sheet|sheets|count|ct|pack|pk)"

    # Any of the above.
    unit_any = "(" pattern_weight "|" pattern_volume "|" pattern_count ")"
}
|
|
|
|
|
+
|
|
|
|
|
# Maps a unit spelling to its canonical form.
#
#   u   unit text as found in the input (any letter case)
#   lu  local: lower-cased copy of u
#
# Returns the canonical spelling ("lb", "oz", "g", "kg", "mL", "L",
# "fl. oz.", "Pieces", "Bags", "Sheets"), or u unchanged when the unit is not
# recognised.
function canonical_unit(u, lu) {
    lu = tolower(u)

    # Weight: these canonical forms are simply the lower-case spelling.
    if (lu == "lb" || lu == "lbs") return "lb"
    if (lu == "oz" || lu == "g" || lu == "kg") return lu

    # Volume
    if (lu == "ml") return "mL"
    if (lu == "l") return "L"
    if (lu ~ /^(floz|fl oz|fl\.oz|fl\. oz|fl\. oz\.)$/) return "fl. oz."

    # Count: every per-item spelling collapses to "Pieces".
    if (lu ~ /^(each|piece|pieces|count|ct|pack|pk)$/) return "Pieces"
    if (lu == "bag" || lu == "bags") return "Bags"
    if (lu == "sheet" || lu == "sheets") return "Sheets"

    # Unknown unit: hand it back untouched.
    return u
}
|
|
|
|
|
+
|
|
|
|
|
# Reads every line of a file into the global products[] array.
#
#   filename  path of the file to read
#   line      local: the line most recently read
#
# Sets the global n_products to the number of lines stored; products[1] is the
# first line of the file. The file is closed before returning.
function load_products(filename, line) {
    n_products = 0
    for (;;) {
        # getline yields 0 at end of file and -1 on error; stop on either.
        if ((getline line < filename) <= 0)
            break
        n_products++
        products[n_products] = line
    }
    close(filename)
}
|
|
|
|
|
+
|
|
|
|
|
# Loads "source=>target" translation pairs into the global source[] and
# target[] arrays and orders them longest source first.
#
#   filename  path of the translation file; when empty, "translate.txt" is
#             used (the path this function previously always read)
#   line      local: current input line
#   i, j      local: sort indices
#   arr       local: capture groups from match()
#   tmp       local: swap buffer
#
# Sets the global n_translate to the number of pairs loaded. Lines that do not
# contain "=>" are ignored. Whitespace immediately after "=>" is dropped from
# the target; the source is kept exactly as written before the last "=>".
function load_translations(filename, line, i, j, arr, tmp) {
    if (filename == "")
        filename = "translate.txt"

    n_translate = 0
    while ((getline line < filename) > 0) {
        # Parse format: source=>target
        if (match(line, /(.*)=>\s*(.*)/, arr)) {
            source[++n_translate] = arr[1]
            target[n_translate] = arr[2]
        }
    }
    close(filename)

    # Sort by length of source descending, so longer sources are applied
    # before shorter ones in translate_string.
    for (i = 1; i <= n_translate - 1; i++) {
        for (j = i + 1; j <= n_translate; j++) {
            if (length(source[i]) < length(source[j])) {
                tmp = source[i]; source[i] = source[j]; source[j] = tmp
                tmp = target[i]; target[i] = target[j]; target[j] = tmp
            }
        }
    }
}
|
|
|
|
|
+
|
|
|
|
|
# Applies every loaded translation to a string, in array order.
#
#   s  text to translate
#   i  local: index into source[] / target[]
#
# Reads the globals n_translate, source[] and target[]. Each source[i] is
# handed to gsub() as a dynamic regex, so regex metacharacters in it are
# interpreted, and "&" in target[i] stands for the matched text. Matching is
# case-insensitive when IGNORECASE is set.
#
# Returns the translated copy of s.
function translate_string(s, i) {
    i = 0
    while (++i <= n_translate)
        gsub(source[i], target[i], s)
    return s
}
|
|
|
|
|
+
|
|
|
|
|
# Loads blacklist entries (one per line) into the global blacklist[] array and
# orders them longest first.
#
#   filename  path of the blacklist file; when empty, "blacklist.txt" is used
#             (the path this function previously always read)
#   line      local: current input line
#   i, j      local: sort indices
#   tmp       local: swap buffer
#
# Sets the global n_blacklist to the number of entries loaded.
function load_blacklist(filename, line, i, j, tmp) {
    if (filename == "")
        filename = "blacklist.txt"

    n_blacklist = 0
    while ((getline line < filename) > 0) {
        blacklist[++n_blacklist] = line
    }
    close(filename)

    # Sort blacklist by length descending, so longer entries are removed
    # before shorter ones in strip_blacklist.
    for (i = 1; i <= n_blacklist - 1; i++) {
        for (j = i + 1; j <= n_blacklist; j++) {
            if (length(blacklist[i]) < length(blacklist[j])) {
                tmp = blacklist[i]; blacklist[i] = blacklist[j]; blacklist[j] = tmp
            }
        }
    }
}
|
|
|
|
|
+
|
|
|
|
|
# Removes every blacklisted entry from a string and tidies the whitespace.
#
#   s  text to clean
#   i  local: index into blacklist[]
#
# Reads the globals n_blacklist and blacklist[]. Each entry is handed to
# gsub() as a dynamic regex, so regex metacharacters in it are interpreted.
#
# Returns s with the entries removed, whitespace runs collapsed to a single
# space, and no leading or trailing space.
function strip_blacklist(s, i) {
    i = 0
    while (++i <= n_blacklist)
        gsub(blacklist[i], "", s)

    # Collapse whitespace runs; afterwards at most one space can remain at
    # each end, so a single pass trims both.
    gsub(/[[:space:]]+/, " ", s)
    gsub(/^ | $/, "", s)
    return s
}
|
|
|
|
|
+
|
|
|
|
|
# Reports whether a line is marked as organic.
#
#   line  text to inspect (matched case-insensitively)
#
# Returns the string "true" when the lower-cased line contains " org ",
# " org." or "organic"; otherwise the string "false".
function parse_organic(line) {
    if (tolower(line) ~ / org | org\.|organic/)
        return "true"
    return "false"
}
|
|
|
|
|
+
|
|
|
|
|
# Extracts the first dollar amount of the form $D.CC from a line.
#
#   line        text to search
#   price_paid  local: unused scratch slot kept for signature compatibility
#   m           local: match() capture array
#
# Returns the amount without the leading "$" (e.g. "9.99"), or "" when the
# line contains no such amount.
function parse_total(line, price_paid, m) {
    if (!match(line, /\$[0-9]+\.[0-9][0-9]/, m))
        return ""

    # m[0] always starts with "$"; skip that one character.
    return substr(m[0], 2)
}
|
|
|
|
|
+
|
|
|
|
|
# Works out the quantity and unit of a purchase.
#
#   line         the quantity/price line (e.g. "2.42 lbs x $1.99 each")
#   description  the product description, scanned for an embedded size
#   d, q1, q2, q3, rest, dunit, damount, patA, patB, patC
#                locals (scratch arrays, strings and patterns)
#
# Results are delivered through the globals `amount` and `unit`; the return
# value is always "". Both globals are cleared on entry so that a line which
# matches no pattern (or Pattern C, which sets no unit) cannot inherit values
# from an earlier call.
#
# Reads the global unit_any and calls canonical_unit.
function parse_amount_unit(line, description, d, q1, q2, q3, rest, dunit, damount, patA, patB, patC) {
    amount = ""
    unit = ""

    # Pattern A: 2.42 lbs x $1.99 each
    patA = "([0-9.]+)[ ]*(" unit_any ")[ ]*x"
    # Pattern B: 6ea/0.850 kg  (number + unit preceded by a non-"$" character)
    patB = "[^$]([0-9.]+)[ ]*(" unit_any ")[ ]*"
    # Pattern C: 1 x $9.99 each  (bare count at the start of the line)
    patC = "^([0-9.]+)[ ]*x"

    if (match(line, patA, q1)) {
        amount = q1[1]
        unit = canonical_unit(q1[2])
    }
    else if (match(line, patB, q2)) {
        amount = q2[1]
        unit = canonical_unit(q2[2])
    }
    else if (match(line, patC, q3)) {
        amount = q3[1]
        # unit resolved below (from the description, or defaulted)
    }

    # Scan the description for "<number> <unit>" pairs; the last one wins.
    rest = description
    dunit = ""
    damount = ""
    while (match(rest, "([0-9.]+)[ ]*(" unit_any ")", d)) {
        rest = substr(rest, RSTART + RLENGTH)
        dunit = canonical_unit(d[2])
        damount = d[1]
    }

    # Prefer the size from the description when the line only gave a piece
    # count (or nothing) and the description's number is numerically larger.
    if ((dunit != "" && damount != "" && damount + 0 > amount + 0) && (unit == "Pieces" || unit == "")) {
        unit = dunit
        amount = damount
    }

    # Defaults when nothing could be determined.
    if (amount == "") amount = 1
    if (unit == "") unit = "Pieces"
    return ""
}
|
|
|
|
|
+
|
|
|
|
|
###############################################################################
# Compute Levenshtein distance (standard dynamic-programming implementation)
#
#   a, b   the two strings to compare
#   la, lb, i, j, cost, d, tmp
#          locals: lengths, loop indices, substitution cost, the DP table
#          (d[i,j] = distance between the first i chars of a and the first j
#          chars of b) and a scratch value. tmp is declared here so the
#          function no longer overwrites a global of the same name.
#
# Returns the minimum number of single-character insertions, deletions and
# substitutions needed to turn a into b.
###############################################################################
function levenshtein(a, b, la, lb, i, j, cost, d, tmp) {
    la = length(a)
    lb = length(b)

    # Base cases: transforming to/from the empty prefix.
    for (i = 0; i <= la; i++) d[i,0] = i
    for (j = 0; j <= lb; j++) d[0,j] = j

    # Fill dynamic table
    for (i = 1; i <= la; i++) {
        for (j = 1; j <= lb; j++) {
            cost = (substr(a, i, 1) == substr(b, j, 1) ? 0 : 1)
            d[i,j] = d[i-1,j] + 1                                   # deletion
            if ((tmp = d[i,j-1] + 1) < d[i,j]) d[i,j] = tmp         # insertion
            if ((tmp = d[i-1,j-1] + cost) < d[i,j]) d[i,j] = tmp    # substitution
        }
    }
    return d[la,lb]
}
|
|
|
|
|
+
|
|
|
|
|
# Normalises text for comparison.
#
#   s  text to normalise
#
# Returns s lower-cased, with every character other than a-z, 0-9 and space
# deleted, runs of spaces reduced to one, and no leading or trailing space.
function norm(s) {
    s = tolower(s)
    gsub(/[^a-z0-9 ]/, "", s)

    # Only runs of two or more spaces need rewriting; afterwards at most one
    # space can remain at each end, so one pass trims both.
    gsub(/  +/, " ", s)
    gsub(/^ | $/, "", s)
    return s
}
|
|
|
|
|
+
|
|
|
|
|
# Reports whether two texts share at least one (approximately) equal word.
#
#   line  first text (normalised with norm() before comparison)
#   prod  second text (normalised with norm() before comparison)
#   dist, desc, dw, pw, i, j, n_desc, n_prod
#         locals: similarity score, normalised first text, word arrays, loop
#         indices and word counts. n_desc / n_prod are declared here so the
#         function no longer overwrites globals of the same name.
#
# Words of the first text shorter than 3 characters are skipped. Two words
# count as shared when they are equal, or when their similarity
# 1 - levenshtein / (length of the longer word) is greater than 0.8.
#
# Returns 1 if such a pair exists, 0 otherwise.
function has_shared_word(line, prod, dist, desc, dw, pw, i, j, n_desc, n_prod) {
    desc = norm(line)
    prod = norm(prod)

    n_desc = split(desc, dw, " ")
    n_prod = split(prod, pw, " ")

    for (i = 1; i <= n_desc; i++) {
        if (length(dw[i]) < 3) continue
        for (j = 1; j <= n_prod; j++) {
            # Equal words need no edit-distance computation.
            if (dw[i] == pw[j]) return 1

            dist = levenshtein(dw[i], pw[j])
            dist = 1 - (dist / (length(dw[i]) > length(pw[j]) ? length(dw[i]) : length(pw[j])))
            if (dist > 0.8) return 1
        }
    }

    return 0
}
|
|
|
|
|
+
|
|
|
|
|
# Finds the loaded product that best matches a description.
#
#   description  free text to match
#   best, bestdist, dprod, descnorm, pnorm, dist, i
#                locals (dprod is unused)
#
# Reads the globals n_products and products[]. Only products for which
# has_shared_word() is true are scored. The score is
# 1 - distance / (length of the longer normalised string), where distance is
# the Levenshtein distance, reduced by 1 when the normalised product occurs
# inside the normalised description.
#
# Returns the products[] entry with the highest score when that score is at
# least 0.25 (the earliest entry wins ties), otherwise "".
function fuzzy_product(description, best, bestdist, dprod, descnorm, pnorm, dist, i) {
    best = ""
    bestdist = 0
    descnorm = norm(description)

    i = 0
    while (++i <= n_products) {
        pnorm = norm(products[i])
        if (has_shared_word(descnorm, pnorm)) {
            dist = levenshtein(descnorm, pnorm)

            # Small bonus when the product appears verbatim in the description.
            if (index(descnorm, pnorm) > 0)
                dist--

            # Turn the distance into a similarity relative to the longer string.
            if (length(descnorm) > length(pnorm))
                dist = 1 - (dist / length(descnorm))
            else
                dist = 1 - (dist / length(pnorm))

            if (dist > bestdist) {
                bestdist = dist
                best = products[i]
            }
        }
    }

    return (bestdist >= 0.25) ? best : ""
}
|
|
|
|
|
+
|