8000 fixing external postagging bug · ChatScript/ChatScript@2085d88 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2085d88

Browse files
committed
fixing external postagging bug
1 parent 12ee59e commit 2085d88

File tree

8 files changed

+33
-28
lines changed

8 files changed

+33
-28
lines changed

SRC/english.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,10 @@ uint64 GetPosData( int at, char* original,WORDP& revise, WORDP &entry,WORDP &can
696696
{ // this is not allowed to write properties/systemflags/internalbits if the word is preexisting
697697
uint64 properties = 0;
698698
sysflags = cansysflags = 0;
699-
canonical = 0;
699+
canonical = 0;
700+
if (at < 1) { ; } // not from sentence
701+
else if (canonicalLower[at]) canonical = canonicalLower[at]; // note canonicalLower may already be set by external postagging
702+
else if (canonicalUpper[at]) canonical = canonicalUpper[at]; // note canonicalUpper may already be set by external postagging
700703
entry = 0;
701704
if (start == 0) start = 1;
702705
if (revise) revise = NULL;
@@ -970,7 +973,7 @@ uint64 GetPosData( int at, char* original,WORDP& revise, WORDP &entry,WORDP &can
970973
if (participle && !strcmp(participle,original)) properties |= NOUN_ADJECTIVE;
971974
}
972975
WORDP canon = GetCanonical(entry);
973-
if (canon) canonical = canon;
976+
if (canon && !canonical) canonical = canon;
974977
if (canonical) cansysflags = canonical->systemFlags;
975978

976979
// german postag data marks all nouns without separating singular from plural

SRC/englishTagger.cpp

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,7 @@ static void SetCanonicalValue(int start,int end)
720720
else ++lower;
721721
}
722722
bool caseSignificant = (lower > 3 && lower > upper);
723+
bool csEnglish = !stricmp(language, "english");
723724

724725
// now set canonical lowercase forms
725726
for (int i = start; i <= end; ++i)
@@ -734,7 +735,8 @@ static void SetCanonicalValue(int start,int end)
734735
WORDP D = FindWord(original);
735736
WORDP canon1 = (D) ? GetCanonical(D) : NULL;
736737
char* canon = (canon1) ? canon1->word : NULL;
737-
if (posValues[i] & (DETERMINER| IDIOM) && original[1] == 0) // treat "a" as not a letter A
738+
739+
if (csEnglish && posValues[i] & (DETERMINER| IDIOM) && original[1] == 0) // treat "a" as not a letter A
738740
{
739741
canon = NULL;
740742
canonicalLower[i] = originalLower[i];
@@ -746,7 +748,7 @@ static void SetCanonicalValue(int start,int end)
746748
canonicalLower[i] = originalLower[i];
747749
continue;
748750
}
749-
else if (allOriginalWordBits[i] & CONJUNCTION )
751+
else if (csEnglish && allOriginalWordBits[i] & CONJUNCTION )
750752
{
751753
if (!stricmp(wordStarts[i], "times")) // a conjunction looking like plural that in singular is a normal word
752754
{
@@ -757,19 +759,19 @@ static void SetCanonicalValue(int start,int end)
757759
}
758760

759761
// a word like "won" has noun, verb, adjective meanings. We prefer a canonical that's different from the original
760-
if (canon && IsUpperCase(*canon)) canonicalUpper[i] = FindWord(canon);
761-
else if (canon) canonicalLower[i] = FindWord(canon);
762+
if (csEnglish && canon && IsUpperCase(*canon)) canonicalUpper[i] = FindWord(canon);
763+
else if (csEnglish && canon) canonicalLower[i] = FindWord(canon);
762764
else if (pos & NUMBER_BITS); // must occur before verbs and nouns, since "second" is a verb and a noun
763765
else if (canonicalLower[i] && canonicalLower[i]->properties & (NOUN_NUMBER|ADJECTIVE_NUMBER)); // dont change canonical numbers like December second
764-
else if (allOriginalWordBits[i] & NOUN_GERUND) // because singing is a dict word, we might prefer noun over gerund. We shouldned
766+
else if (csEnglish && allOriginalWordBits[i] & NOUN_GERUND) // because singing is a dict word, we might prefer noun over gerund. We shouldned
765767
{
766768
canonicalLower[i] = FindWord(GetInfinitive(original,false));
767769
}
768-
else if (pos & (VERB_BITS | NOUN_GERUND | NOUN_INFINITIVE|ADJECTIVE_PARTICIPLE) )
770+
else if (csEnglish && pos & (VERB_BITS | NOUN_GERUND | NOUN_INFINITIVE|ADJECTIVE_PARTICIPLE) )
769771
{
770772
canonicalLower[i] = FindWord(GetInfinitive(original,false));
771773
}
772-
else if (pos & ADJECTIVE_NORMAL && !(D && D->properties & (MORE_FORM|MOST_FORM)))
774+
else if (csEnglish && pos & ADJECTIVE_NORMAL && !(D && D->properties & (MORE_FORM|MOST_FORM)))
773775
{
774776
canonicalLower[i] = originalLower[i]; // "his *fixed view should be adjective and not participle given it is an adjective- arbitrary
775777
if (allOriginalWordBits[i] & ADJECTIVE_PARTICIPLE)
@@ -778,7 +780,7 @@ static void SetCanonicalValue(int start,int end)
778780
if (verb) canonicalLower[i] = FindWord(verb);
779781
}
780782
}
781-
else if (pos & (NOUN_BITS - NOUN_GERUND - NOUN_ADJECTIVE) || (canonicalLower[i] && !stricmp(canonicalLower[i]->word,original)))
783+
else if (csEnglish && pos & (NOUN_BITS - NOUN_GERUND - NOUN_ADJECTIVE) || (canonicalLower[i] && !stricmp(canonicalLower[i]->word,original)))
782784
{
783785
if (pos & (NOUN_PROPER_SINGULAR|NOUN_PROPER_PLURAL) && canonicalUpper[i] && canonicalUpper[i]->properties & NOUN) // can it be upper case interpretation?
784786
{
@@ -793,17 +795,17 @@ static void SetCanonicalValue(int start,int end)
793795
}
794796
if (canonicalLower[i] && canonicalLower[i]->properties & (DETERMINER|NUMBER_BITS));
795797
else if (IsAlphaUTF8(*original) && canonicalLower[i] && !strcmp(canonicalLower[i]->word,(char*)"unknown-word")); // keep unknown-ness
796-
else if (pos & NOUN_BITS && !canonicalUpper[i])
798+
else if (csEnglish && pos & NOUN_BITS && !canonicalUpper[i])
797799
{
798800
char* noun = GetSingularNoun(original,false,true);
799801
if (noun) canonicalLower[i] = FindWord(noun);
800802
}
801-
else if (D && D->internalBits & UPPERCASE_HASH && FindWord(original, 0,LOWERCASE_LOOKUP))
803+
else if (csEnglish && D && D->internalBits & UPPERCASE_HASH && FindWord(original, 0,LOWERCASE_LOOKUP))
802804
{
803805
canonicalLower[i] = FindWord(original,0, LOWERCASE_LOOKUP);
804806
}
805807
}
806-
else if (pos & (ADJECTIVE_BITS - ADJECTIVE_PARTICIPLE - ADJECTIVE_NOUN) || (canonicalLower[i] && !stricmp(canonicalLower[i]->word,original)))
808+
else if (csEnglish && pos & (ADJECTIVE_BITS - ADJECTIVE_PARTICIPLE - ADJECTIVE_NOUN) || (canonicalLower[i] && !stricmp(canonicalLower[i]->word,original)))
807809
{
808810
if (canonicalLower[i] && canonicalLower[i]->properties & NUMBER_BITS);
809811
else
@@ -813,13 +815,13 @@ static void SetCanonicalValue(int start,int end)
813815
}
814816

815817
// for adjectives that are verbs, like married, go canonical to the verb if adjective is unchanged
816-
if (canonicalLower[i] && !strcmp(canonicalLower[i]->word,original))
818+
if (csEnglish && canonicalLower[i] && !strcmp(canonicalLower[i]->word,original))
817819
{
818820
char* infinitive = GetInfinitive(original,false);
819821
if (infinitive) canonicalLower[i] = FindWord(infinitive);
820822
}
821823
}
822-
else if (pos & ADJECTIVE_NOUN)
824+
else if (csEnglish && pos & ADJECTIVE_NOUN)
823825
{
824826
if (canonicalLower[i] && canonicalLower[i]->properties & NUMBER_BITS);
825827
else if (IsUpperCase(*wordStarts[i]) && caseSignificant) {;} // upper case is intentional
@@ -829,7 +831,7 @@ static void SetCanonicalValue(int start,int end)
829831
if (adj) canonicalLower[i] = FindWord(adj);
830832
}
831833
}
832-
else if (pos & ADVERB || (canonicalLower[i] && !stricmp(canonicalLower[i]->word,original)))
834+
else if (csEnglish && pos & ADVERB || (canonicalLower[i] && !stricmp(canonicalLower[i]->word,original)))
833835
{
834836
if (canonicalLower[i] && canonicalLower[i]->properties & NUMBER_BITS);
835837
else canonicalLower[i] = FindWord(GetAdverbBase(original,false));
@@ -844,14 +846,14 @@ static void SetCanonicalValue(int start,int end)
844846
else if (*original == '~') canonicalLower[i] = FindWord(original);
845847
else if (!IsAlphaUTF8(*original)) canonicalLower[i] = FindWord(original);
846848

847-
if (pos & PRONOUN_BITS && !stricmp(original,(char*)"one")) // make it a number
849+
if (csEnglish && pos & PRONOUN_BITS && !stricmp(original,(char*)"one")) // make it a number
848850
{
849851
canonicalLower[i] = StoreWord((char*)"1",NOUN|NOUN_NUMBER);
850852
}
851853

852854
// handle composite verb canonical for single hypen case
853855
char* hyphen = strchr(original,'-');
854-
if (hyphen && pos & (VERB_BITS|NOUN_GERUND|ADJECTIVE_PARTICIPLE|NOUN_INFINITIVE)) // find the verb root.
856+
if (csEnglish && hyphen && pos & (VERB_BITS|NOUN_GERUND|ADJECTIVE_PARTICIPLE|NOUN_INFINITIVE)) // find the verb root.
855857
{
856858
char word[MAX_WORD_SIZE];
857859
strcpy(word,original);
@@ -881,7 +883,7 @@ static void SetCanonicalValue(int start,int end)
881883
else canonicalLower[i] = can;
882884
}
883885
if (canonicalLower[i] && IsDigit(*canonicalLower[i]->word)) wordCanonical[i] = canonicalLower[i]->word; // leave numbers alone
884-
else if (canonicalLower[i] && originalLower[i])
886+
else if (csEnglish && canonicalLower[i] && originalLower[i])
885887
{
886888
if (!GetCanonical(originalLower[i]) && posValues[i] & NOUN_SINGULAR && !(allOriginalWordBits[i] & NOUN_GERUND) && stricmp(canonicalLower[i]->word,(char*)"unknown-word")) // saw does not become see, it stays original - but singing should still be sing and "what do you think of dafatgat" should remain
887889
{
@@ -893,7 +895,7 @@ static void SetCanonicalValue(int start,int end)
893895
else if (canonicalUpper[i]) wordCanonical[i] = canonicalUpper[i]->word;
894896
else wordCanonical[i] = wordStarts[i];
895897
}
896-
SetSentenceTense(start,end);
898+
if (csEnglish) SetSentenceTense(start,end);
897899
}
898900

899901
static char* PosBits(uint64 bits, char* buff)
@@ -1387,7 +1389,6 @@ void TagIt() // get the set of all possible tags. Parse if one can to reduce thi
13871389
if (!externalTagger && *GetUserVariable((char*)"$cs_externaltag"))
13881390
{
13891391
// not treetagger, just a named topic
1390-
externalTagger = 1;
13911392
OnceCode((char*)"$cs_externaltag");
13921393
}
13931394

SRC/functionExecute.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3132,7 +3132,7 @@ static FunctionResult SetCanonCode(char* buffer)
31323132
int n = atoi(arg1);
31333133
if (n < 1 || n > wordCount) return FAILRULE_BIT;
31343134
char* arg2 = ARGUMENT(2);
3135-
WORDP D = StoreWord(arg2);
3135+
WORDP D = StoreWord(arg2,AS_IS);
31363136
wordCanonical[n] = D->word;
31373137
if (!IsUpperCase(*wordCanonical[n]))
31383138
{

SRC/mainSystem.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2967,7 +2967,7 @@ void OnceCode(const char* var,char* function) // run before doing any of his i
29672967
if (BlockedBotAccess(topicid) || GAMBIT_MAX(block->topicMaxRule) == 0)
29682968
{
29692969
char word[MAX_WORD_SIZE];
2970-
sprintf(word,"There are no gambits in topic %s for %s.",GetTopicName(topicid),var);
2970+
sprintf(word,"There are no gambits in topic %s for %s or topic is blocked for this bot.",GetTopicName(topicid),var);
29712971
AddResponse(word,0);
29722972
ChangeDepth(-1,name);
29732973
return;

SRC/scriptCompile.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4248,8 +4248,8 @@ char* ReadOutput(bool optionalBrace,bool nested,char* ptr, FILE* in,char* &mydat
42484248
}
42494249
if (*word == 'a' && word[2] == 0 && (word[1] == ';' || word[1] == '"' || word[1] == '\'' ) )
42504250
WARNSCRIPT((char*)"Is %s supposed to be a rejoinder marker?\r\n",word,currentFilename);
4251-
4252-
if ((*word == '}' && level == 0) || TopLevelUnit(word) || TopLevelRule(lowercaseForm) || Rejoinder(lowercaseForm) || !stricmp(word,(char*)"datum:")) // responder definition ends when another major unit or top level responder starts
4251+
if (*word == '}' && level == 0 && !optionalBrace) BADSCRIPT("extra } closing nothing")
4252+
if ((*word == '}' && level == 0 && optionalBrace) || TopLevelUnit(word) || TopLevelRule(lowercaseForm) || Rejoinder(lowercaseForm) || !stricmp(word,(char*)"datum:")) // responder definition ends when another major unit or top level responder starts
42534253
{
42544254
if (*word != ':') // allow commands here
42554255
{
@@ -4690,7 +4690,7 @@ Then one of 3 kinds of character:
46904690
// word is a rejoinder type
46914691
strcpy(kind,lowercaseForm);
46924692
}
4693-
else ReportBug((char*)"unexpected word in ReadTopLevelRule - %s",word)
4693+
else ReportBug((char*)"Prior script not complete- unexpected top level word %s after seeing %s", lowercaseForm, data - 20)
46944694
}
46954695

46964696
// did he forget to fill in any [] jumps

SRC/topicSystem.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2292,7 +2292,7 @@ static void LoadTopicData(const char* fname,const char* layerid,unsigned int bui
22922292
}
22932293
else
22942294
{
2295-
ptr = ReadInt(ptr,block->topicFlags);
2295+
ptr = ReadInt(ptr,block->topicFlags); //0x19 111423313 1 0 65 simpletopic.top
22962296
if (block->topicFlags & TOPIC_SHARE) shared = true; // need more data written into USER zone
22972297
ptr = ReadInt(ptr,block->topicChecksum);
22982298
ptr = ReadInt(ptr,topLevelRules);

WIKI/ChatScript-System-Variables-and-Engine-defined-Concepts.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ setting them.
266266
| `%day` |Sunday, etc
267267
| `%daynumber` | 1-7 where 1 = Sunday
268268
| `%fulltime` | seconds representing the current time and date (Unix epoch time)
269+
| `%fullmstime` | milliseconds representing the current time and date (Unix epoch time)
269270
| `%hour` | 0-23
270271
| `%timenumbers` | completely consistent full time info in numbers. You can do <br>`_0 = ^burst(%timenumbers)` to get `_0`=seconds (2 digits) <br>`_1`=minutes (2 digits) <br>`_2`=hours (2 digits) <br>`_3`=day in week (0-6, Sunday=0) <br>`_4`=date in month (1-31) <br>`_5`=month (0-11, January=0) <br>`_6`=year.<br>You need to get them simultaneously if you want to do accurate things with the current time, since retrieving `%hour` and `%minute` separately allows time to change between calls
271272
| `%leapyear` | boolean if current year is a leap year

WIKI/ESOTERIC-CHATSCRIPT/ChatScript-PosParser.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/
342342
The German bot assumes you have installed TreeTagger and the German data per instructions provided
343343
on his website. I have only tried the Windows installation.
344344

345-
To use a foreign tagger, you set the `$cs_externaltag` variable in your bot definition to a topic that will
345+
To use a foreign tagger, you set the `$cs_externaltag` variable in your bot definition to a topic (called in Gambit mode) that will
346346
perform the work. You also need to stop ChatScript from performing that work itself. Make your bot
347347
definition `$cs_token` NOT use the following: `#DO_SPELLCHECK`, `#DO_PARSE`,
348348
`#DO_SUBSTITUTE_SYSTEM` (since that performs English substitutions and punctuation processing).

0 commit comments

Comments
 (0)
0