Longest common word prefix

Difference between version 1 and 4 - Previous - Next
[dbohdan] 2018-03-26: The following module lets you find the longest common ''word prefix'' of two strings, which is to say, the first ''N'' words the strings have in common. For our purposes ''words'' are defined as string fragments separated by separators. A separator is a string that matches a given regular expression understood by `[regexp]`.


** See also **

   * `longestCommonPrefix` and `splitx` in [textutil]
   * `longestCommonSubsequence` in [struct::list]
   * `sepsplit` in [Sqawk]


** Code **

Download with [wiki-reaper]: `wiki-reaper -x 55230 0 | tee lcwp.tcl`

======
#! /usr/bin/env tclsh# Copyright (c) 2018, db2021 D. Bohdan
# License: MITpackage require Tcl 8.5-10

namespace eval ::lcwp {    variable version 0.23.0
    interp alias {} lcwp {} longest-common-word-prefix
}

proc ::lcwp::longest-common-word-prefix {
    s1 s2 {sep {\s+}} {includeTailSep 0}
} {
    if {[string length $s2] > [string length $s1]} {
        set t $s2
        set s2 $s1
        set s1 $t
        unset t
    }

    set offset 0
    set tailSepLength 0
    while 1 {
        lassign [read-word $s1 $offset $sep] label1 \
                                             matchedFramement1 \
                                             matchedSep1
        lassign [read-word $s2 $offset $sep] label2 \
                                             matchedFramement2 \
                                             matchedSep2

        # Handle fragments.
        if {$matchedFramement1 ne $matchedFramement2} {
            break
        }
        set fragmentLength [string length $matchedFramement1]
        incr offset $fragmentLength
        if {$fragmentLength > 0} {
            set tailSepLength 0
        }

        # Handle separators.
        if {$matchedSep1 ne $matchedSep2} {
            break
        }
        incr tailSepLength [string length $matchedSep1]
        incr offset [string length $matchedSep1]

        # Handle string end.
        if {$label1 eq {END} || $label2 eq {END}} {
            break
        }
    }
    if {!$includeTailSep} {
        incr offset -$tailSepLength
    }
    return [string range $s1 0 $offset-1]
}

proc ::lcwp::read-word {s offset sep} {
    if {[regexp -indices -start $offset -- $sep $s match]} {
        lassign $match start end
        set matchedFramement [string range $s $offset $start-1]
        set matchedSep [string range $s $start $end]
        set label MORE
    } else {
        set matchedFramement [string range $s $offset end]
        set matchedSep {}
        set label END
    }
    return [list $label $matchedFramement $matchedSep]
}

proc ::lcwp::replace-prefix {prefix s {replacement { }}} {
    set prefixLen [string length $prefix]
    set replacementLen [string length $replacement]
    set repeats [expr {
        $replacementLen > 0 ?
        $prefixLen / $replacementLen + 1 :
        0
    }]
    set newPrefix [string range [string repeat $replacement $repeats] \
                                0 \
                                $prefixLen-1]
    return $newPrefix[string range $s $prefixLen end]
}

# If this is the main script, run the tests.
if {[info exists argv0] && ([file tail [info script]] eq [file tail $argv0])} {
    package require tcltest
    namespace path ::lcwp
    if {$argv ne {}} {
        tcltest::configure -match $argv
    }

    tcltest::test common-word-prefix-1.1 {simple case} -body {        lcommon-word-prefix {hello world 1} {hello world 2}
    } -result {hello world}

    tcltest::test common-word-prefix-1.2 {all the same words} -body {        list [lcommon-word-prefix foo foo] \
             [lcommon-word-prefix {hello world} {hello world}]
    } -result {foo {hello world}}

    tcltest::test common-word-prefix-1.3 {all different words} -body {        list [lcommon-word-prefix foo bar] \
             [lcommon-word-prefix {foo bar} {baz qux}]
    } -result {{} {}}

    tcltest::test common-word-prefix-1.4 {words sharing a prefix} -body {        list [lcommon-word-prefix foo food] \
             [lcommon-word-prefix fool food] \
             [lcommon-word-prefix {hello world alpha} {hello world aleph}]
    } -result {{} {} {hello world}}

    tcltest::test common-word-prefix-1.5 {different length} -body {        list [lcommon-word-prefix {foo bar baz} foo] \
             [lcommon-word-prefix {foo bar baz} {foo bar}] \
             [lcommon-word-prefix {foo bar baz} {foo bar }] \
             [lcommon-word-prefix {foo bar } {foo bar baz}] \
             [lcommon-word-prefix {foo bar} {foo bar baz}] \
             [lcommon-word-prefix foo {foo bar baz}]
    } -result {foo {foo bar} {foo bar} {foo bar} {foo bar} foo}

    tcltest::test common-word-prefix-1.6 includeTailSep -body {        list [lcommon-word-prefix {hello world 1} {hello world 2} { } 0] \
             [lcommon-word-prefix {hello world 1} {hello world 2} { } 1] \
             [lcommon-word-prefix hello-world-1 hello-world-2 - 0] \
             [lcommon-word-prefix hello-world-1 hello-world-2 - 1]
    } -result {{hello world} {hello world } hello-world hello-world-}

    tcltest::test common-word-prefix-1.7 whitespace-1 -body {        list [lcommon-word-prefix {foo  bar   1} {foo  bar } { } 0] \
             [lcommon-word-prefix {foo  bar   1} {foo  bar } { } 1] \
             [lcommon-word-prefix {  foo  bar   1} {  foo  bar  } { } 0] \
             [lcommon-word-prefix {  foo  bar   1} {  foo  bar  } { } 1]
    } -result {{foo  bar} {foo  bar } {  foo  bar} {  foo  bar  }}

    tcltest::test replace-prefix-1.1 {default use} -body {
        list [replace-prefix {} {}] \
             [replace-prefix foo foo] \
             [replace-prefix {foo bar} {foo bar baz}]
    } -result {{} {   } {        baz}}

    tcltest::test replace-prefix-1.2 pattern -body {
        list [replace-prefix {} {} 12345] \
             [replace-prefix foo foo 12345] \
             [replace-prefix {foo bar } {foo bar baz} 12345]
    } -result {{} 123 12345123baz}

    # Exit with a nonzero status if there are failed tests.
    set failed [expr {$tcltest::numTests(Failed) > 0}]

    tcltest::cleanupTests
    if {$failed} {
        exit 1
    }
}
======


** Use example **

*** Code ***

======
source lcwp.tcl

proc log text {
    set message "[clock format [clock seconds] -gmt 1] $text"
    set prefix [::lcwp::longest-common-word-prefix $::prevLogMessage \
                                                   $message \
                                                   {\s} \
                                                   1]
    puts stderr [::lcwp::replace-prefix $prefix $message $::logPlaceholder]
    set ::prevLogMessage $message
}

foreach logPlaceholder {{ } -=} {
    set prevLogMessage {}
    log "frobnicating file /foo/bar"
    log "frobnicating file /foo/baz"
    log "frobnicating file /foo/qux"
}
======

*** Output ***

======none
Mon Mar 26 16:37:34 GMT 2018 frobnicating file /foo/bar
                                               /foo/baz
                                               /foo/qux
Mon Mar 26 16:37:34 GMT 2018 frobnicating file /foo/bar
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-/foo/baz
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-/foo/qux
======


<<categories>>Package | String Processing | Word and Text Processing