Interactive PowerBasic Forum

IT-Consultant: Charles Pegge => OxygenBasic Examples => Topic started by: Zlatko Vid on December 01, 2023, 06:55:42 PM

Title: Function strMatchPattern
Post by: Zlatko Vid on December 01, 2023, 06:55:42 PM
Function strMatchPattern recently posted by Theo generated by AI
do i made some stupid mistakes or i forget or something
else ..this function need array or not
in original use continue inside While loop i never saw before
is that normal ..program simply crush ?

my try:
' Simple pattern matching function
'global vars
#lookahead
Dim in[500] as string
Dim patt[500] as string
'fn
Function strMatchPattern(s[500] as string, pattern[500] as string) as bool
    Dim sPos as int  : sPos = 1
    Dim pPos as int  : pPos = 1
    Dim sLen as int  : slen = len(s[])
    Dim pLen as int  : pLen = len(pattern[])
    Dim match as int : match = 0
    Dim star as int  : star = 0

    While sPos <= sLen
        if pPos <= pLen and (pattern[pPos] = s[sPos] or pattern[pPos] = "?") then
            pPos = pPos + 1
            sPos = sPos + 1
            goto cont
        end if

        if pPos <= pLen and pattern[pPos] = "*" then
            star = pPos
            match = sPos
            pPos = pPos + 1
           goto cont
        end if

        if star <> 0 then
            pPos = star + 1
            match += 1
            sPos = match
            goto cont
        end if

        return false
    Wend

'label
cont:

    While pPos <= pLen and pattern[pPos] = "*"
        pPos = pPos + 1
    Wend
    'fn return boolean TRUE/FALSE
    Return pPos > pLen
End Function

'call function...
in[1] = "testing" : patt[1]="test*"
print strMatchPattern( in[1] , patt[1] )



Title: Re: Function strMatchPattern
Post by: Zlatko Vid on December 02, 2023, 05:20:12 PM
so nobody respond ...looks like a stupid question or what ?
Title: Re: Function strMatchPattern
Post by: Charles Pegge on December 02, 2023, 05:33:01 PM
It is an interesting algorithm with hidden complexity. Here is one of my code serpents. :) It is in the form of a instr, and as you can see, it is considerably more complex than this AI version:

'01/12/2023 CP

'01/12/2023
'
function instrp(int i,string s,p,int*en=0) as int
=================================================
'
'i   start position
's   string to search
'p   search string (with optional ? * wildcards
'en  boundary of found key string
'
if i<1
  i=1
endif
if not s
  return 0 'no source string
endif
if not p
  return 1 'no search string
endif
'
indexbase 1
byte bs at strptr(s) 'source string bytes
byte bp at strptr(p) 'search string bytes
int js=i      'source string position
int jp=1      'search string position
int ks=i      'next search start
int kt=i      'current search start
int kf=0      'prev key wild card position
int kg=0      'prev source wild card position
int fs=i      'possible found position
int ls=len(s) 'limit of source string
int lp=len(p) 'limit of search string
int ok=1      'vrification flag
byte ka=bp[1] 'first byte to search
do
  if jp>lp
    exit do
  endif
  if js>ls
    if jp<=lp
      ok=0
    endif
    exit do
  endif
  select bp[jp]
  case 42 '*
    ok=1
    while bp[jp]=42
      jp++
    wend
    do
      if js>ls 'end of string
        ok=1
        jp--
        exit do
      endif
      if ka=bs[js]
        ks=js
      endif
      if jp>lp 'end of pattern
        ok=1
        jp--
        exit do
      endif
      if bp[jp]=bs[js] 'possible match char
        ok=1
        jp--
        'these flags used for resuming wild card
        kf=jp
        kg=js+1
        exit do
      endif
      if bp[jp]=63 'possible match ? char
        ok=1
        jp--
        exit do
      endif
      js++
    loop
  case 63 '?
    if ka=bs[js]
      ks=js
    endif
    ok=1
    js++
  case else 'all other chars
    if bp[jp]=bs[js] 'byte match
      ok=1
      if ka=bs[js]
        ks=js
      endif
      js++
    else 'mismatch
      ok=0
      if kf
        'prev wildcard positions
        jp=kf-1
        js=kg
        kf=0
        kg=0
      else
        if ka=63
          ks=kt '?' was wildcard start
        endif
         if ks=kt
           ks++ 'nudge fwd
        endif
        kt=ks
        js=ks 'restart search
        fs=js
        jp=0
      endif
    endif
  end select
  jp++ 'next pattern byte
loop
if ok
  function=fs
else
  js=0
endif
en=js
end function
'
'test cases
===========
string s="aabcdefghijklmmnopqrstuvwxyz"
'print instrp 1,s,"ab"
'print instrp 1,s,"abz"
'print instrp 1,s,"xyz"
'print instrp 1,s,"mno"
'print instrp 1,s,"mm?o"
'print instrp 1,s,"m?o"
'print instrp 1,s,"m??"
'print instrp 1,s,"m??p"
'print instrp 1,s,"??m"
'print instrp 1,s,"*m"
'print instrp 1,s,"m*"
'print instrp 1,s,"m*z"
'print instrp 1,s,"m*k*z"
'print instrp 1,s,"m*n"
'print instrp 1,s,"a*b"
'print instrp 1,s,"a?*c"
'print instrp 1,s,"c?d"
'print instrp 1,s,"c*d"
'print instrp 1,s,"c?*e"
'print instrp 1,s,"c**f"
'print instrp 1,s,"c*??f" 'yes
'print instrp 1,s,"c*??g" 'no
'print instrp 1,s,"c??*g" 'yes
int en
'print instrp 1,s,"c??*g",en 'yes
'print en
uses console
s="abcdefghijklmmabcdefghijklmn"
int b
b=instrp(1,s,"d*lmm",en)
print b " - " en cr
print mid(s,b,en-b) cr
print instrp(1,s,"d*lmn",en) " - " en cr
print cr "ok" cr
wait
Title: Re: Function strMatchPattern
Post by: Theo Gottwald on December 04, 2023, 12:51:23 PM
Optimizations and Changes:

Wildcard Handling: Optimized the handling of the * wildcard by skipping consecutive *s and efficiently searching for the next character in the pattern.

Redundant Checks: Removed redundant length checks and streamlined the handling of edge cases like empty strings or patterns.

Character Matching Logic: Direct comparison of bytes from source and pattern strings to enhance performance.
Memory Optimization: Reduced the number of variables and their scope where possible.

Efficient Data Structures: Utilized direct memory access (byte bs at strptr(s), byte bp at strptr(p)) for potentially more efficient string processing, depending on language support.

Notes:
The code is written in a style similar to the original, assuming a Basic-like language. The actual optimization may vary based on the specific language's features and capabilities.
Profiling and testing are recommended to ensure the optimized code performs as expected and to validate improvements.


function instrp(int i, string s, p, int* en=0) as int
  ' Initialize variables
  if i < 1 then i = 1
  if not s then return 0 ' No source string
  if not p then return 1 ' No search string

  ' Pre-calculate string lengths to avoid redundant calls
  int ls = len(s)
  int lp = len(p)
  if ls = 0 or lp = 0 then return 0 ' Handle empty strings

  ' Using more efficient direct memory access (if supported by the language)
  byte bs at strptr(s)
  byte bp at strptr(p)

  ' Optimization: Preprocessing for first character of pattern
  byte ka = bp[1]
  int js = i, jp = 1, ks = i, kt = i
  int fs = 0, kf = 0, kg = 0, ok = 0

  ' Main search loop
  while js <= ls and jp <= lp
    select bp[jp]
    case 42 ' * Wildcard
      ' Skip consecutive * characters
      do while jp < lp and bp[jp+1] = 42
        jp += 1
      loop

      ' Special case: * at the end of pattern
      if jp = lp then return ks

      ' Find the next character in source string
      jp += 1
      ka = bp[jp]
      do while js <= ls and bs[js] <> ka
        js += 1
      loop
      if js > ls then return 0

      kf = jp
      kg = js

    case 63 ' ? Wildcard
      if js > ls then return 0
      if ka = bs[js] then ks = js
      js += 1

    case else ' Regular character
      if bp[jp] <> bs[js] then
        ' Mismatch, use previous wildcard positions if available
        if kf > 0 then
          jp = kf
          js = kg
          kf = 0
          kg = 0
          continue while
        else
          ' No previous wildcard, restart search
          jp = 1
          ks += 1
          js = ks
          if js > ls then return 0
        endif
      else
        ' Character match
        if fs = 0 then fs = js ' First match position
        js += 1
      endif
    end select
    jp += 1
  wend

  ' Check for match completion
  if jp > lp then
    en = js - 1
    return fs
  else
    return 0
  endif
end function

Title: Re: Function strMatchPattern
Post by: Charles Pegge on December 04, 2023, 02:29:47 PM
Interesting.

At first glance, it understands the algorithm well, though the optimization of early termination on an end wild-card would be incorrect for the purposes of the instrp returning a complete match-string. The en variable should also return the position following the last character of the match. My variable annotations were also omitted, with the variables stacked with colons, thereby less easy to  read.
Title: Re: Function strMatchPattern
Post by: Pierre Bellisle on December 11, 2023, 11:30:50 PM
In PB, I use the best one from Wildcard matching algo (https://xoomer.virgilio.it/acantato/dev/wildcard/wildmatch.html#c_cpp_userjournal_algo)
Much faster than PathMatchSpec().
The ones you guys have seems real nice too...
Title: Re: Function strMatchPattern
Post by: Zlatko Vid on December 12, 2023, 08:27:51 AM
Charles

Maybe is your more complex but looking good to me
and i don't like using while-continue ugly construction
btw AI program simply crush,so not work for me.

I think that pattern ..if we called "pattern" with same symbol like "*"
is to use some sort of lexer...but maybe i am wrong.
Title: Re: Function strMatchPattern
Post by: Charles Pegge on December 13, 2023, 07:35:18 AM
This is a sleeker version of my previous code-snake, that uses pointers instead of arrays to improve performance, and some redundant/repetitive lines have been removed:

'CP
'01/12/2023
'13/12/2023

uses console

'using pointers and null terminators
'case insensitive
'accepts char* and strings
'

indexbase 0
'mapping lowercase to uppercase
dim byte UpperCase[255]
int i
for i=0 to 255
  UpperCase[i]=i
next
for i=0x60 to 0x7a
  UpperCase[i]-=32
next

function instrp(int i,char *s,*p, int *en=0) as int
===================================================
'
'i   start position
's   string to search
'p   search string (with optional ? * wildcards
'en  boundary of found key string
'
en=0
'
if i<1
  i=1
endif
if not s
  return 0 'no source string
endif
if not p
  return 1 'no search string
endif
'
byte js at strptr(s) 'source string bytes
@js+=i-1             'instrp offset
byte jp at strptr(p) 'search string bytes
byte ks at @js       'next search start
byte kt at @js       'current search start
byte *kf             'prev key wild card position
byte *kg             'prev source wild card position
int fs=0             'found index in s
int star             'wildcard flag

ScanBytes:
==========
do
'
if jp=0
  if star=0
    en=@js-strptr(s)+1
    return @kt-strptr(s)+1 ' 'success
  endif
elseif js=0
  return 0 'fail
endif
if ks=js
  @ks=@js 'update restart pos
endif
while jp=42
  @jp++
  star=1
wend
byte *us,*up
@us=@UpperCase+js
@up=@UpperCase+jp
if (us=up) or (jp=63) 'possible match char
  @jp++
  @js++
  if star
    'these pointers used for resuming wild card
    @kf=@jp-2
    @kg=@js
    if js=0
      @js--
    endif
    star=0
  endif
  continue do
endif 'match
if star 'no need to match
  @js++ 'next char
  continue do
endif
'mismatching chars
if @kf
  'prev wildcard positions
  @jp=@kf
  @js=@kg
  @kf=0
  @kg=0
  continue do
endif
if ks=63
  @ks=@kt '?' was wildcard start
endif
if @ks=@kt
  @ks++ 'nudge fwd
endif
@kt=@ks
@js=@ks 'restart search
@jp=strptr(p) 'reset pattern
'
end do
end function
'
'test cases
===========
int en
def pr
  output %1 + tab + str(instrp(1,s,%1,en)) + tab + str(en) + cr
end def
string s="aabcdefghijklmmnopqrstuvwxyz"
pr "a"
pr "aa"
pr "ab"
pr "abz"
pr "xyz"
pr "mno"
print "----------" cr
pr "m?o"
pr "m??"
pr "m??p"
pr "??m"
print "----------" cr
pr "*m"
pr "m*"
pr "m*z"
pr "m*k*z"
pr "k*m*z"
pr "m*n"
pr "a*b"
print "----------" cr
pr "a?*c"
pr "c?d"
pr "c*d"
pr "c?*e"
pr "c**f"
pr "c*??f"
pr "c*??g"
pr "c??*g"
print "----------" cr
int en
print str(instrp(1,s,"c??*g", en)) cr
print str(en) cr

print "----------" cr

s="abcdefghijklmmabcdefghijklmn"
int b
b=instrp(1,s,"d*lmm",en)
print str(b) " - " str(en) cr
print mid(s,b,en-b) cr
print str(instrp(1,s,"d*lmn",en)) " - " str(en) cr
print cr "ok" cr
wait

PS: correction to patterns ending with '*'