-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathEnDeRE.js
1799 lines (1729 loc) · 66.5 KB
/
EnDeRE.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* ========================================================================= //
// vi: set ts=4:
// vim: set ts=4:
#? NAME
#? EnDeRE.js - functions for parsing regular expressions
#
# ToDo: just a simple parser implemented, does not check
# - identify greedy/lazy quantifiers
# - behaviour of . and $
# - improve 'leading' and 'trailing' texts
# - special behaviours of selected language (like version dependence)
# - take care about double escaped characters (see y below)
#?
#? SYNOPSIS
#? <SCRIPT language="JavaScript1.3" type="text/javascript" src="EnDeREMap.js"></SCRIPT>
#? <SCRIPT language="JavaScript1.3" type="text/javascript" src="EnDeRE.js"></SCRIPT>
#?
#? Additional for GUI:
#? <SCRIPT language="JavaScript1.5" type="text/javascript" src="EnDeGUI.js"></SCRIPT>
#? <SCRIPT language="JavaScript1.5" type="text/javascript" src="EnDeREGUI.js"></SCRIPT>
#?
# ? Additional for testing:
# ? <SCRIPT language="JavaScript1.5" type="text/javascript" src="EnDeTest.js"></SCRIPT>
# ?
#? DESCRIPTION
#? Functions for testing, analysing and displaying regular expressions.
#?
#? EnDeRE.parse() - parse given source as regular expression
#? EnDeRE.match() - try to match regular expression against
#? given source (text)
#? EnDeRE.lang() - return RegEx engine type for given language
#?
# Internal functions:
# EnDeRE.parseInit() - convert an array to a hash
# EnDeRE.arr2hash() - convert an array to a hash
# EnDeRE.explain() - add description if meta character
# EnDeRE.level() - check for indentation (grouping, class, etc.)
# EnDeRE.parse._checkmeta() - does the nasty things/checks
#?
#? WHAT IT IS NOT
#? These functions are not
#? - test for RegEx syntax (lint or alike)
#? - converter for RegEx from to another syntax (flavour)
#? The result, either just pretty printed or with description text, is
#? not always reversible to the original given RegEx (because of added
#? blank, space, tab and/or newline characters).
#?
# HACKER's INFO
# The functions and methods found herein are used to parse and explain
# regular expressions (RegEx for now) in various flavours.
# So we first have to explain the terminology used for RegEx:
# RegEx - the string of the regular expression itself
# text/string - the text the RegEx should match against
# flavour - implementation specific syntax of RegEx
# match - the match of an RegEx in a string
# character - character, byte or bytes representing a letter
# literal - a character or string used as shown
# metacharacter - character with special behaviour, in fact the
# opposite of a literal
# metasequence - sequence of characters which build a special
# behaviour like metacharacters
# escape - a sequence of characters, either to make it a
# metacharacter or to treat a metacharacter as
# literal, i.g. \ is used to escape characters
# class - a character class (enclosed in [ ] usually)
# subexpression - part of an expression or an expression within
# another expression
# grouping - grouping alternate text literals
# quantifier - metacharacter to specify amount of matches
# interval - min. and max. amount of matches (quantifier)
# backreferences - "remember" matches, in particular groups
# modifiers - metasequences to turn following characters in
# RegEx to literals
# ...
# The following terms are used interchangeably (as most literature does):
# string - text - character literal
# meta - metacharacter - metasequence
# grouping - capturing
# modifiers - RegEx literals
# quantifier - interval
#
# The algorithm used to parse and explain the given RegEx according
# the specified flavour is as follows:
#
# Step0 - (to be found in EnDeREMap.js)
# A (hash) table with all known and available behaviours
# will be defined for each flavour.
# For better maintenance and because we ran out of unique
# keys (for example d is DEL control character and also
# digit class) the table is organized in several smaller
# tables (see chrs.ctrl[], .meta[], .clss[] etc. ).
# This static definition is done once using JavaScript's
# (ECMA) prototype functionality, aka JSON.
# Note to future hackers:
# instead of using several of smaller tables, we could
# have used one big three dimensional array like:
# chararray['flavour']['meta-character'] = magic;
# but that's harder to maintain.
# You may also initialize such an array at startup ..
# Step1 - For faster/better access, extract the definitions from
# Step0 according the given RegEx flavour to a new hash,
# where the hash key is the (literal) character itself.
# Note to hackers: this is a contribution to Step0.
# Step2 - Build an array according the given RegEx which marks
# all meta or special characters as defined in the hash
# table (see Step0, Step1), and also marks all remaining
# (other) characters as literals.
# Step3 - Loop over the source and find start of RegEx.
# Detecting the RegEx string and knowing how it will be
# evaluated by the language/flavour depends on following
# - the language/flavour itself
# - if the language/flavour evaluates the string first
# and passes the result to the regex engine
# - raw RegEx mode selected in GUI (passed as parameter)
# As result the RegEx literal string is known for Step4.
# If the regEx string is evaluated as string first, then
# the result from Step2 needs to be modified according.
# Step4 - Loop over the source -the given RegEx- split the RegEx
# at functional units (groups) and insert descriptions
# for all non-literal characters.
# Step4a - ignore initial /
# Step4b - check for leading ^
# Step4c - check for end of RegEx
# Step4d - unused
# Step4e - check for escaped character
# Step4f - check for escape character
# Step4g - all other characters
# Step4h - special handling inside character classes []
# Step4t - collect trailing text after RegEx
#
# NOTE: this is a parser and hence ugly code by nature.
#
# Some very special behaviours are not implemented in the parser. See
# EnDeREMap.js for details (mainly marked with // ToDo: ).
#
# if (ccc=='indexOf') { continue; }
# This check inside 'for (key in array)' loops is a contribution
# to ancient Mozilla 1.x which has this property.
#?
#? REFERENCES
#? The following sources of information were used:
#? Mastering Regular Expressions (1st and 3rd Edition), J. Friedl
#? PCRE http://www.pcre.org/pcre.txt
#? MySQL 5.1 http://dev.mysql.com/doc/refman/5.1/en/regexp.html
# ? MySQL 5.1 http://dev.mysql.com/doc/refman/5.1/en/string-comparison-functions.html
# ? ==> REGEXP and RLIKE are not multi-byte safe!
# ? MySQL 4.x is same as 5.x and 6.x according RegEx.
#?
#? VERSION
#? @(#) EnDeRE.js 3.7 12/12/08 09:22:18
#?
#? AUTHOR
#? 02-mar-08 Achim Hoffmann, mailto: EnDe (at) my (dash) stp (dot) net
#?
* ========================================================================= */
/* ToDo: more languages ..
* -----
C, C#, D, lua
PHP and some more need \\t in RegEx to become \t
VB.NET supports (?# .. ) as comment
escape/quote functions:
Perl: \Q .. \E ; q(), qq(), qr(), qx()
.NET: Regex.Escape()
PHP: preg_quote()
Java: quote()
*/
// ========================================================================= //
// public definition of all features and behaviours //
// ========================================================================= //
// == Step0 ==
/* ---- Description of the internal data structure see EnDeREMap.js */
// ========================================================================= //
// public EnDeRE object //
// ========================================================================= //
var EnDeRE = new function() { // already initializes in EnDeREMaps.js
this.SID = '3.7';
this.sid = function() {
    //#? return the version identification string of this file
    return '@(#) EnDeRE.js 3.7 12/12/08 09:22:18 EnDeRE';
};
// ======================================================================= //
// public and alias functions //
// ======================================================================= //
this.lang = function(src) {
//#? map language to RegEx engine type, return engine type
    /* Several languages share one engine definition and differ only in minor
     * behaviours (for example string parsing in C# versus VB.NET).  The alias
     * table is looked up on EnDeTMP._lang.prototype; a language without an
     * entry there is returned unchanged.
     */
    var engine = EnDeTMP._lang.prototype[src];
    return (engine != undefined) ? engine : src;
}; // lang
// ======================================================================= //
// global variables //
// ======================================================================= //
// Static tables; the constructors are provided by EnDeTMP (see EnDeREMap.js
// as named in the file header).
this.chrs = new EnDeTMP._chrs; // list of meta characters for each language/flavour
this.desc = new EnDeTMP._desc; // description of control, class and meta characters
this.desc.clss = this.desc.meta; // alias: class descriptions share the meta table (same object)
this.desc.quantifier = this.desc.meta; // alias: quantifier descriptions share the meta table (same object)
this.context = new EnDeTMP._context; // context of some meta and classes
// Parse/match results; all start out as null.
this.rex = null; // this array is a copy of the parsed RegEx and
// holds the value according to this.meta for each character
this.matches = null; // this array holds all matches in given text for RegEx
this.backref = null; // this array holds all backreferences in given text for RegEx
// index number corresponds to backreference (index 0 unused)
this.usexml = 0; // 1: output as XML, 0: output as text (read by this.out())
// finally describe myself (as .prototype in EnDeREMap.js fails):
// every behaviour marker defined on EnDeTMP is copied onto this object and its
// description text (EnDeTMP._<name>) is registered in this.desc.idx under the
// marker's value; the order below is the order of the assignments
var _markerNames = ['x', 'a', 'y', 'i', 'd', 'e', 'h', 'O', 'r', 'o', 'K'];
var _markerPos = 0;
for (_markerPos = 0; _markerPos < _markerNames.length; _markerPos++) {
    this[_markerNames[_markerPos]] = EnDeTMP[_markerNames[_markerPos]];
    this.desc.idx[EnDeTMP[_markerNames[_markerPos]]] = EnDeTMP['_' + _markerNames[_markerPos]];
}
// Demo RegEx mixing constructs of several flavours (POSIX classes, named
// groups, Unicode properties, mode modifiers, ...).
this.sample = '"\'/^group[[:digit:]]*?^(*LF)use linefeed(?:grp2(foo|bar)+e(?P<var>val)(?<push>value)[s\\nS]caped\\n%sed\\(escaped \\bgrouping\\)null\\0(_$QL(3\\s4)){1,3})(?-im:noCase)class[\\s[:alpha:]]?Python(?P=var)PCRE\\k<var>.NET\\k{var}(?<-pop>)invalid class[:punct:](Unicode properties:\\pL\\p{Me}\\p{^Other}\\P{InvertedScript}\\pN(?:Visual Studio Unicode Property:[:Lu:N:n:h]))EnDe$"\'/mg';
// sample used in (EnDe)GUI
// Opening bracket -> closing bracket; consulted by this.bracket().
// NOTE(review): an Array is used as a hash here, so inherited/own Array
// properties (e.g. 'length') are also visible through this.map[...].
this.map = new Array(); // map brackets
this.map['('] = ')';
this.map['['] = ']';
this.map['{'] = '}';
this.map['<'] = '>';
this.ident = 4;
// ToDo: ident NOT YET IMPLEMENTED
// ======================================================================= //
// RegEx functions //
// ======================================================================= //
this.out = function(tag,src) {
//#? return given data formated for output: plain text or XML
    /* tag - name of the XML element to wrap src in (XML mode); in text mode
     *       only 'desc' is special: it gets a "<TAB># " comment prefix
     * src - the text to emit
     */
    if (this.usexml == 1) {
        return '<' + tag + '>' + src + '</' + tag + '>';
    }
    var prefix = (tag === 'desc') ? '\t# ' : '';
    return prefix + src;
};
this.bracket = function(src) {
//#? return matching closing bracket for src, otherwise src itself
    var closing = this.map[src]; // this.map maps ( [ { < to ) ] } >
    return (closing != undefined) ? closing : src;
};
this.explain = function(src,goab,meta,desc) {
//#? return description of meta character, formated if required
    /* src  - the character or string to describe
     * goab - the current state object of parsing; read here: .hold, .ident, .print
     * meta - behaviour marker of src according language (see this.parse() Step1)
     * desc - corresponding description text (for meta)
     *
     * Without a marker (null/undefined) the held text plus src is returned as is.
     */
    if (meta == null) { // covers null and undefined
        return goab.hold + src;
    }
    var text = '';
    if ((meta === this.O) || (meta === this.o)) {
        // this.O: nothing to do; this.o: ToDo: needs something special ...
        text = goab.hold + src;
    } else {
        // this.x additionally raises a dialog, then is formatted like any other marker
        if (meta === this.x) { alert('explain: '+src); }
        if (src != '') { // avoid some useless newlines ..
            text = '\n' + goab.ident + goab.hold + src;
        }
        if (goab.print == true) {
            text += '\t# ' + desc;
            if (meta == this.x) { text += '\t' + this.desc.idx[this.x]; }
        }
        text += '\n';
    }
    // when descriptions are not printed the current indentation is appended here
    if (goab.print != true) { text += goab.ident; }
    return text;
}; // explain
this.property = function(p,src,goab) {
//#? return description for Unicode properties
/* p - initial character can be p or P
* will be P for \p{^..} also
* src - the property string literal
* goab - as usual ..
*
* Reads goab.prop[] (feature flags compared against this.O), goab.desc.prop,
* goab.desc.propVS, goab.desc.meta, goab.meta, goab.lang and goab.ic.
* goab.ident and goab.hold are changed temporarily and restored before return.
* Returns the text produced by this.explain() for p and for the property.
*/
// ToDo: some special Unicode properties
/*
'2' : 'supports longhand Unicode properties like \\p{Lu}',
'&' : 'supports composite property like \\p{L&}',
'P' : 'supports negated Unicode properties like \\P{L}',
'^' : 'supports negated Unicode properties like \\p{^L}',
's' : 'supports Unicode script properties',
'b' : 'supports Unicode block properties',
*/
var bux = '';
var kkk = '';
var block='';
// bbb is the bare property name: braces, ^ and : removed
var bbb = src.replace(/[{}\^:]/g,'');
if (bbb == src) { // nothing replaced, hence no {}
if (goab.prop['1'] <= this.O) { // not supported, just return
return(goab.hold + p + src);
}
}
// strip a leading "In" prefix (case-insensitive) if flag 'N' is set
if (goab.prop['N'] > this.O) {
kkk = bbb.match(/^In(.*)$/i);
if (kkk != null) {
bbb = kkk[1]; // simply strip it off
block = '(block longhand/pseudo-script) ';
}
}
// strip a leading "Is" prefix (case-insensitive) if flag 'S' is set
if (goab.prop['S'] > this.O) {
kkk = bbb.match(/^Is(.*)$/i);
if (kkk != null) {
bbb = kkk[1]; // simply strip it off
block = '(block longhand/pseudo-script) ';
}
}
bux += this.explain(p, goab, goab.meta, goab.desc.meta[p]);
var ccc = '';
// default description, used when no variant below matches
var desc= '** EnDeRE: unknown Unicode property (may be block or script)';
var hhh = '';
var c = '';
var map = []; // map description text
var ids = '{_-.L:'; // these are the letters used in this.chrs.prop['fuchur']
var j = '';
// NOTE(review): for..in over a string yields the index keys '0'..'5' here;
// it would also pick up enumerable additions to String.prototype.
for (j in ids) {
/* loop through this.chrs.prop{} and build a regex combined of all
* entries found, depending on the letter (ids) the hash key or the
* hash value will be used and finally the _ in the value replaced
* by - a space or nothing
* This results in checking all combinations of for example:
* { _ - . L
* {Sm}, {Math_Symbol}, {Math-Symbol}, {Math Symbol}, {MathSymbol}
*
* As all of these strings (except the first 2) are not in goab.desc
* we need to build an addition map[] also. This map[] contains all
* the keys lowercase and the value as is (as defined in desc.prop).
*/
ccc = '';
if (goab.prop[ids[j]] > this.O) {
// NOTE(review): map is filled with string keys only; setting length to 0
// does not remove those, so entries of earlier variants stay visible.
map.length = 0;
switch (ids[j]) {
case '{':
// short form: the keys of desc.prop themselves
for (c in goab.desc.prop) { ccc += c + '|'; map[c.toLowerCase()] = goab.desc.prop[c]; };
break;
case ':':
if (goab.lang == ':VisualSt') { // ViasualSt has additional entries
/* Visual Studio Unicode properties are case insensitive */
// NOTE(review): keys are stored as is (not lowercased) in this branch,
// while the lookup below lowercases the match when goab.ic is set
for (c in goab.desc.prop) { ccc += c + '|'; map[c] = goab.desc.prop[c]; };
// add desc.propVS{} which may overwrite some defined descriptions
// ToDo: check for language needs to be removed here but done in parseInit()
for (c in goab.desc.propVS) { ccc += c + '|'; map[c] = goab.desc.propVS[c]; };
}
break;
case 'L':
case '_':
case '-':
case '.':
// long form: the values of desc.prop with _ kept, or replaced by - / space / nothing
for (c in goab.desc.prop) {
kkk = goab.desc.prop[c];
switch (ids[j]) {
case '_': break; // strings are already with _
case '-': kkk = kkk.replace(/_/g, '-'); break;
case '.': kkk = kkk.replace(/_/g, ' '); break;
case 'L': kkk = kkk.replace(/_/g, ''); break;
}
ccc += kkk + '|';
map[kkk.toLowerCase()] = goab.desc.prop[c];
}
// ToDo: following should be in EnDeREMaps, somehow ...
if (goab.prop['+'] > this.O) { kkk = 'all'; ccc += kkk +'|'; map[kkk.toLowerCase()] = kkk; }
if (goab.prop['*'] > this.O) { kkk = 'Any'; ccc += kkk +'|'; map[kkk.toLowerCase()] = kkk; }
if (goab.prop['='] > this.O) { kkk = 'Assigned'; ccc += kkk +'|'; map[kkk.toLowerCase()] = kkk; }
if (goab.prop['!'] > this.O) { kkk = 'Unassigned'; ccc += kkk +'|'; map[kkk.toLowerCase()] = kkk; }
kkk = '';
break;
}
// ccc always ends with '|' (or is empty), hence the trailing 'dummy' alternative;
// the names are inserted into the pattern unescaped
kkk = bbb.match(new RegExp('^(' + ccc + 'dummy)$', goab.ic));
if (kkk != null) {
if (goab.ic != '') {
desc= map[kkk[1].toLowerCase()];
} else {
desc= map[kkk[1]];
}
kkk = null;
break; // only one possibility as all strings are unique
}
} // ids[j] > this.O
} // loop all variants
ccc = '';
// save ident/hold, explain the property one tab deeper with empty hold, then restore
kkk = goab.ident;
hhh = goab.hold;
goab.ident += '\t';
goab.hold = '';
// ToDo: check for 's' and 'b' here if necessary (desc==**EnDeRE: ...)
// only the first ':' of src is removed here
bux += this.explain(src.replace(/:/,''), goab, 'K', block + desc); // : already printed
goab.ident = kkk;
goab.hold = hhh;
bux += goab.ident;
return(bux);
}; // property
this.modifier = function(src,goab) {
//#? return description for modifiers (one per line)
    /* src  - modifier string; strictly one of the forms  /flags  (?flags)
     *        (?flags:  or the bare flags, where flags are letters and '-'
     * goab - the current state object of parsing; read here: .modifier[]
     *        (support markers per flag), .desc.modifier[] (texts), .print
     *        and .ident
     *
     * Returns one "<ident><TAB># <flag> : [negated ]<description>\n" line for
     * each flag supported by the flavour (marker > this.O), or '' if src has
     * not the expected form or goab.print is not true.
     *
     * All flags following a '-' are reported as negated: in (?i-mx) both
     * m and x are switched off, not only the flag right after the dash.
     */
    var parsed = src.match(/^(?:[\/]|\(\?)?([a-zA-Z-]+)(?:[:)])?$/); // strict match to /../ or (?..)
    if (parsed == null) { return(''); }
    var flags   = parsed[1];
    var bux     = '';
    var negated = '';   // becomes 'negated ' once a '-' was seen and stays so
    var flag    = '';
    var i       = 0;
    // indexed loop instead of for..in: visits the characters only, never
    // enumerable additions to String.prototype
    for (i = 0; i < flags.length; i++) {
        flag = flags.charAt(i);
        if (flag == '-') { negated = 'negated '; continue; }
        if ((goab.modifier[flag] > this.O) && (goab.print == true)) {
            bux += goab.ident + '\t# ' + flag + ' : ' + negated + goab.desc.modifier[flag] + '\n';
        }
    }
    return(bux);
}; // modifier
this.quantifier=function(src,goab) {
//#? return description for quantifier literal at start of src, empty if none
    /* src  - remaining RegEx text, expected to start with the quantifier
     * goab - the current state object of parsing; goab.quantifier holds the
     *        quantifier literals of the flavour, goab.skip is set to the
     *        number of additional characters consumed by a match
     *
     * Every literal is tried (each character backslash-escaped, anchored at
     * the start of src); the loop does not stop at a match, so the last
     * matching literal determines the result.
     */
    var result = '';
    var key = null;
    var pattern = null;
    var hit = null;
    for (key in goab.quantifier) { // loop over quantifiers
        if (key==='indexOf') { continue; }
        pattern = new RegExp('^(' + goab.quantifier[key].replace(/(.)/g, '\\$1') + ')');
        hit = src.match(pattern);
        if (hit === null) { continue; }
        // ToDo: 'K' not really correct here, replace by proper goab.xxx variable
        result = this.explain(hit[1], goab, 'K', goab.desc.meta[src[0]] + '; '+ goab.desc.meta[goab.quantifier[key]]);
        goab.skip = hit[1].length - 1;
    }
    return(result);
}; // quantifier
this.level = function(src,goab,meta,typ) {
//#? return description and identation for braces etc.
/* src - the character or string to describe
* goab - the current state object of parsing
* meta - behaviours of src according language (see this.parse() Step1)
* (not used herein, but passed through to this.explain())
* typ - the type of meta character (see Step1)
*
* function modifies goab
*
* Fields of goab written here: backr, isgroup, isclass, isrange, skip, ident.
* The bracket/state handling in the first part only runs if goab.print is
* true and typ is not this.O; many branches return directly from there.
* Otherwise the final switch on typ formats the single character src[0]:
*   this.a - explain, then add one tab to goab.ident
*   this.e - remove the last character of goab.ident, then explain
*   this.K - explain without changing goab.ident
*   else   - return src[0] as is
*/
// ToDo: should become part of this.explain()
// _no(): format a backreference number as ' (#n)'; empty string stays empty
function _no (src) { if (src != '') { return(' (#' + src + ')' ); }; return(src); }
var bux = '';
var bbb = '';
var ccc = src[0];
var kkk = '';
var reg = null;
var br = 0; // 1 if backreferences need to be reset
var j = '';
if (goab.print == true) {
// take care about state (braces, parantheses, etc.)
if (typ != this.O) {
// kkk used for backreference counter
// ToDo: kkk does not work for closing (left) parantheses, hence we omit it for now
switch (ccc) {
case '(':
if (goab.isclass == true) { return(ccc); break; } // simple character inside character class
// count the group optimistically; undone below (goab.backr--) if br is set
goab.backr++; kkk = goab.backr; // ToDo: no backr for non-capturing groups
goab.isgroup += typ;
bbb = ccc;
br = 0;
if (src[1] == '?') { // most (all?) '(?' are non-capturing, hence no backreference
// NOTE(review): src[2] is undefined if src is exactly '(?' - .match would throw then
if (src[2].match(/[<:!=#?>({]/) != null) { br = 1; }
}
// now we have to check our meta string literals starting with (
for (j in goab.literal.meta) {
if (j==='indexOf') { continue; }
if (goab.ctxmeta[src[2]] == this.O) { continue; } // not supported, hence nothing to do
// ToDo: need to check goab.ctxlook here
ccc = goab.literal.meta[j];
bux = src.substr(0,ccc.length);
if (ccc == bux) { // found meta string literal
switch (ccc) { // ugly hack to find the proper description
case '(k<': // ToDo: never reached as k is not prefixed (
br = 1; // ToDo: goab.backr is wrong backreference, need to find correct one
bbb = '(k< >';
break;
default: bbb = ccc; break;
}
if (br != 0) { // reset backreferences counts
goab.backr--;
kkk = '';
}
bux = this.explain(ccc, goab, meta, goab.desc.meta[bbb] + _no(kkk));
goab.skip = ccc.length - 1;
goab.ident += '\t';
if (goab.print == true) { bux += goab.ident; }
return(bux);
break; // never reached
}
} // all literals
// some special meta string literals
ugly: {
/*
* now we check special string literals which contain dynamic parts
* this works as follows:
* 1. check if given source matches the desired string literal
* 2. if it matches, set bbb to the key used in _desc.*{} and
* exit ugly scope
* the key for _desc.*{} is special, it must be known here
* keep in mind that the sequence for the matches is important,
* 'cause the first match wins
*/
// on exit of this labelled block ccc is either the match array of the
// first successful check or null; br is 1, or 2 for mode modifiers
// backtracking verbs
ccc = src.match(/^\(\*[A-Z]+\)/);
// here bbb becomes the match array itself; used as key it is converted to the matched text
if ((ccc != null) && (goab.ctxmeta['*'] > this.O)) { br = 1; bbb = ccc; break ugly; }
// ToDo: need to check goab.ctxverb also
// named capture variable
ccc = src.match(/^\(\?\P<[a-zA-Z_]+\>/); // most common ..
if ((ccc != null) && (goab.ctxmeta['P'] > this.O)) { br = 1; bbb = '(?P< >'; break ugly; }
// ToDo: python fails 'cause of escaped \( and \)
ccc = src.match(/^\(\?P=[a-zA-Z_]+\)/); // python, PHP
if ((ccc != null) && (goab.ctxmeta['v'] > this.O)) { br = 1; bbb = '(?P=VAR'; break ugly; }
ccc = src.match(/^\(\?\<[a-zA-Z_]+\>/); // .NET only
if ((ccc != null) && (goab.ctxmeta['N'] > this.O)) { br = 1; bbb = '(?< >'; break ugly; }
ccc = src.match(/^\(\?\<-[a-zA-Z_]+\>/); // .NET only
if ((ccc != null) && (goab.ctxmeta['N'] > this.O)) { br = 1; bbb = '(?<- >'; break ugly; }
ccc = this.chrs.modifier.fuchur.replace(/\s*/g,'');
// ToDo: check if supported by flavour
// NOTE(review): the two modifier patterns below are not anchored with ^
if (goab.ctxmod[')'] != this.O) {
reg = new RegExp('\\(\\?[' + ccc + '-]+' + '\\)', ''); // mode modifier (?-ceimx)
ccc = src.match(reg);
reg = null;
if (ccc != null) { br = 2; bbb = '(?..)'; break ugly; }
}
ccc = this.chrs.modifier.fuchur.replace(/\s*/g,'');
if (goab.ctxmod[':'] != this.O) {
reg = new RegExp('\\(\\?[' + ccc + '-]+' + '\\:', ''); // mode modifier span (?-ceimx)
ccc = src.match(reg);
reg = null;
if (ccc != null) { br = 2; bbb = '(?..:'; break ugly; }
}
ccc = null;
} // ugly
if (br != 0) { // reset backreferences counts
goab.backr--;
kkk = '';
}
if (ccc != null) { // got something
// ToDo: lookbehind ctxmeta['b'] ctxmeta['b'] is not supported by all flavours, needs to be checked here
bux = this.explain(ccc[0], goab, meta, goab.desc.meta[bbb] + _no(kkk));
// br == 2: mode modifier, append one description line per flag
if (br == 2) { bux += this.modifier(ccc.toString(),goab); }
goab.skip = ccc[0].length - 1;
if (ccc.toString().match(/^\(\*[A-Z]+\)/) == null) { // no ident for backtracking verbs
goab.ident += '\t';
}
if (goab.print == true) { bux += goab.ident; }
bbb = null;
ccc = null;
return(bux);
}
// we reach here if no meta string literal found; now check specials
/*
ccc = src.substr(0,2)
if (ccc = '(?') {
kkk = src.match(new RegExp('\\(\\?[' + bbb + '-]+' + '\\)', '')); // (?-ceimx)
if (kkk != null) { // found mode modifier
bbb = ccc + '..' + ccc[1] + ']'; // '[....]' or '[=..=]'
ccc = kkk.toString(); // JavaScript is picky, need cast to String here!
}
}
*/
// no break; !
// falling through from '(' means typ is added to goab.isgroup a second time here
case ')': goab.isgroup += typ; break;
case '[':
if (goab.isclass == false) {
// start of a character class
goab.isclass = true;
if (src[1] == '^') { // this is special
bbb = '[^';
bux = this.explain(bbb, goab, meta, goab.desc.meta[bbb]);
goab.skip = 1;
goab.ident += '\t';
bux += goab.ident;
return(bux);
}
break;
}
// already parsing a character class
// now we have to check our class string literals starting with [
for (j in goab.literal.clss) {
if (j==='indexOf') { continue; }
ccc = goab.literal.clss[j];
bux = src.substr(0,goab.literal.clss[j].length);
//alert('ccc:'+ccc+' bux:'+bux);
if (ccc == bux) { // found clss string literal
// first check if this flavour supports this literal
if (goab.ctxclss[ccc[1]] == this.O) { continue; } // ToDo: can we break here?
switch (ccc) { // ugly hack to find the proper description
case '[:':
case '[=':
case '[.':
// need to find the closing bracket '.]' or '=]';
bbb = ccc;
// ToDo: not sure if character equivalents [=x=] may have more than one character
// ToDo: not sure if multiple character equivalents [=x=] can occur inside [...],
// if not then following match must end with \]\]
kkk = src.match(new RegExp('\\' + ccc + '[a-zA-Z-]+' + '\\' + ccc[1] + '\\' + ']', ''));
if (kkk != null) { // found a POSIX special class
bbb = ccc + '..' + ccc[1] + ']'; // '[....]' or '[=..=]'
ccc = kkk.toString(); // JavaScript is picky, need cast to String here!
} else {
//kkk =''
//ccc = src;
bux = this.explain(ccc[0], goab, meta, '**WARNING: probably unescaped [ character inside character class **') + goab.ident;
return(bux);
break; // never reached
}
kkk = '';
break;
default: bbb = ccc; break;
}
// marker this.x: append its description text from this.desc.idx
if (goab.ctxclss[ccc[1]]===this.x) { kkk += '\t' + this.desc.idx[this.x]; }
bux = this.explain(ccc, goab, meta, goab.desc.meta[bbb] + _no(kkk));
goab.skip = ccc.length - 1;
if (goab.print == true) { bux += goab.ident; }
return(bux);
break; // never reached
}
}
// we reach here if it is not a class string
if ((goab.isclass == true) && (goab.isesc == false)) {
// ToDo: this check should be part of EnDeREMap.js (new column in EnDeTMP._context.class needed)
bux = this.explain('[', goab, meta, '**WARNING: unescaped [ character inside character class **');
return(bux);
}
break;
case ']':
// outside a class ] is returned as plain character
if (goab.isclass == false) { return(ccc); }
goab.isclass= false;
break;
case '}':
// outside a range } is returned as plain character
if (goab.isrange == false) { return(ccc); }
goab.isrange= false;
break;
case '{': goab.isrange= true;
// check for quantifiers ranges
if (goab.quantifier.length > 0) {
bbb = this.quantifier(src, goab);
if (bbb != '') { // .. got something
return(bbb);
}
}
goab.isrange = false; // not a valid quantifier, just return
return(ccc);
break;
} // switch (ccc)
}
// turn the backreference number (if any) into its ' (#n)' suffix
kkk = _no(kkk);
} // desc
bbb = null;
// continue here if we got a simple (single) meta
ccc = src[0];
switch (typ) {
case this.a :
// explain first, then indent what follows by one tab
bux = this.explain(ccc, goab, meta, goab.desc.meta[ccc]+kkk);
goab.ident += '\t';
if (goab.print == true) { bux += goab.ident; }
break;
case this.e :
// outdent by one character first, then explain
goab.ident = goab.ident.substr(0,(goab.ident.length-1));
bux = this.explain(ccc, goab, meta, goab.desc.meta[ccc]+kkk);
if (goab.print == true) { bux += goab.ident; }
break;
case this.K :
bux = this.explain(ccc, goab, meta, goab.desc.meta[ccc]+kkk);
if (goab.print == true) { bux += goab.ident; }
break;
case 0 :
default: bux = ccc; break;
}
return(bux);
}; // level
this.str2regex= function(lng,arr) {
// empty stub: accepts lng and arr, does nothing and returns undefined
}; // str2regex
this.arr2hash = function(lng,arr) {
//#? convert an array to a hash
    /* lng - key of the row to extract from arr
     * arr - table with a 'fuchur' string (the column characters, white
     *       space is ignored) and one array of values per language
     *
     * Example:
     *   EnDeTMP._chrs.ctrl = {
     *       'fuchur' : 'n r t',
     *       'foo'    : [O,K,O],
     *       'bar'    : [K,O,K]
     *   };
     * EnDeRE.arr2hash('foo',EnDeTMP._chrs.ctrl) returns
     *   { 'n' : O, 'r' : K, 't' : O }
     *
     * Returns null if arr has no row for lng.  The result is an Array used
     * as hash whose length is preset to the number of columns.
     */
    var values = arr[lng];
    if (values == undefined) { return(null); }
    var columns = arr['fuchur'].replace(/\s/g, ''); // avoid white spaces
    var hash = new Array(columns.length);
    columns.split('').forEach(function(chr, pos) {
        hash[chr] = values[pos];
    });
    return(hash);
}; // arr2hash
this.checkraw = function(goab,meta,raw) {
//#? check for special string handling; reset goab.init['"'] if necessary
    /* goab - the current state object of parsing; goab.orig selects the language
     * meta - the detected string delimiter (not used herein)
     * raw  - array with raw delimiter and prefixes from this.chrs.raw[goab.orig]
     *
     * Only :Perl, :PCRE and :PHP are handled; all other languages (including
     * :ModSecurity) leave goab untouched.  Nothing is returned.
     */
    var handled = (goab.orig === ':Perl') || (goab.orig === ':PCRE') || (goab.orig === ':PHP');
    if (!handled) { return; }
    if (raw[0] == "'") { // raw string mode
        goab.init['"'] = this.O;
        goab.asis = false;
    }
    if (raw[0] == '"') {
        goab.asis = true;
    }
    // #dbx alert('#dbx checkraw: '+raw[0]+' : '+goab.raw);
}; // checkraw
this.XML = new function() {
//#? container for reading XML files and evaluating their data
    this.tag  = 'user-regex';   // the XML tag we expect in the file
    this.file = null;           // must be set to the XML document before data() is called
    this.data = function() {
    //#? get XML data as JavaScript
        /* Returns the nodeValue of the second child node of the first
         * <user-regex> element, undefined if the document has no such
         * element, or '' if reading the node failed (a dialog is shown then).
         *
         * The fully correct way would be to loop over all elements and pick
         * the CDATA_SECTION_NODE (nodeType 4) children, but we are lazy: there
         * should/must be exactly one element and it must contain our data as
         * CDATA text, hence it is read directly.
         * NOTE(review): childNodes[1] presumably is the CDATA section
         * following a leading text node - verify against the files written
         * by EnDeRE.template().
         */
        var nodes = EnDeRE.XML.file.getElementsByTagName(EnDeRE.XML.tag);
        if (nodes.length == 0) { return void(0); } // ToDo: what to do here??
        var kkk = '';
        try { kkk = nodes[0].childNodes[1].nodeValue; }
        // alert() shows its first argument only, so the error must be
        // concatenated into the message to become visible
        catch(e) { /*EnDeGUI.*/alert('** EnDeRE.XML.data: failed: ' + e); }
        // #dbx alert('\n'+kkk);
        nodes = null;
        return(kkk);
    }; // data
}; // XML
this.template = function(goab) {
  //#? write JavaScript template for user definable RegEx; returns template
  /* goab - object holding the flavour description; the keys handled in the
   *        switch below are written to the template, all other keys are skipped
   * returns a string: an XML document with one EnDeRE.XML.tag element whose
   *        CDATA section contains the JavaScript source of a _user object
   *
   * NOTE: goab.quantifier is overwritten with this.chrs.quantifier.fuchur
   * NOTE(review): hash keys are escaped with _escapeK(), array elements and
   *        string values are written as they are - confirm that they never
   *        contain ' or \
   */
  function _escapeK(src) { // escape javaScript's own escape character
    return(src.replace(/([\\'])/g, '\\$1'));
  };
  function _array(obj,arr) {
    // write array arr as value of hash key obj, one element per line
    var txt = "\n '" + _escapeK(obj) + "': [";
    txt += "\n\t'" + arr.join("',\n\t'") + "'\n\t], // " + obj;
    return(txt);
  };
  function _serialize(idx,obj) {
    // write obj as hash (recursively); idx is the number of tabs for indentation
    // numbers are always written as 0, anything else as quoted string
    var tab = '\n';
    var txt = ''; // must be local, otherwise the buffer of the outer function is overwritten
    var typ = typeof(obj);
    var i = 0, k = 0;
    for (i=0; i<idx; i++) { tab += '\t'; }
    idx++;
    if (typ.match(/object/i) != null) {
      txt = '{';
      for (k in obj) { txt += tab + "'" + _escapeK(k) + "'\t:" + _serialize(idx,obj[k]); }
      txt += tab + "'dumm'\t:0\n\t}, // "; // dummy entry avoids a trailing comma
      return(txt);
    }
    if (typ.match(/number/i) != null) { return(0 + ','); }
    return(" '" + obj + "',");
  };
  var hint = '\n // remove of comment out unsupported ';
  var c,j,k;
  goab.quantifier = this.chrs.quantifier.fuchur; // we want to print all
  var out = '<?xml version="1.0"?>\n<' + EnDeRE.XML.tag + '><![CDATA[';
  out += '\n/* Do not change the XML tag name!';
  out += '\n *\n * Replace 0 as value for supported features in your RegEx';
  out += '\n * in following arrays. The values allowed beside preset 0 are:';
  var vals = [this.x,this.a,this.y,this.i,this.d,this.e, this.h,this.O,this.r,this.o,this.K];
  // index loop: for..in would also visit enumerable extensions of Array.prototype
  for (c=0; c<vals.length; c++) {
    out += '\n * ' + vals[c] + ' : ' + EnDeTMP._desc.prototype.idx[vals[c]];
  }
  out += '\n *\n * Do not remove any hash entries!';
  out += '\n * Do not change strings of the hash keys!';
  out += '\n * Strings in array values can be changed or removed.';
  out += "\n * Ensure that XML's CDATA syntax is not broken.";
  out += '\n *\n * For detailed description of following keys see help page ([?] button).';
  out += '\n */\n\n';
  out += '\nfunction _user() {};// container for user defined regex flavour';
  out += '\n_user.prototype = {';
  out += "\n 'lang'\t: ':user-regex', // ** DO NOT CHANGE THIS **";
  out += "\n 'user'\t: '-- your description here --',";
  out += "\n 'escchr'\t: '\\\\', // set the escape character here";
  for (j in goab) {
    switch (j) {
      case 'quantifier':
        out += hint + j;
        out += _array(j, goab[j]);
        break;
      case 'literal':
        // ToDo: JavaScript's typeof() is too stupid to identify arrays
        //       uniquely, hence this one done manually ...
        out += "\n '" + _escapeK(j) + "': {";
        for (k in goab[j]) {
          if (k == 'desc') { continue; }
          out += '\t' + hint + j + '.' + k;
          out += _array(k, goab[j][k]);
        }
        out += "\n\t}, // " + j;
        break;
      case 'ctrl':
      case 'clss':
      case 'meta':
      case 'escp':
      case 'prop':
      case 'init':
      case 'anchor':
      case 'modifier':
      case 'ctxctrl':
      case 'ctxclss':
      case 'ctxmeta':
      case 'ctxlook':
      case 'ctxtype':
      case 'ctxmod':
        out += "\n '" + _escapeK(j) + "': ";
        out += _serialize(1,goab[j]) + j;// + '\n';
        break;
      default: continue; // anything else not yet used
    }
  }
  // out += "\n 'raw' : { ':user-regex': ['\"'] },";
  // out += "\n 'subs' : { ':user-regex': ['whatever','prefix','here'] },";
  out += "\n 'dumm'\t:0";
  out += "\n}; // _user";
  out += "\n\n_user.parseInit = function(goab) {";
  out += "\n //#? user definable settings";
  out += "\n\talert('_user.parseInit: ');";
  out += "\n\n\t/* write your initialization code here .. */";
  out += "\n\n\t//return(); // **DON'T USE it, as it will break eval()";
  /* return(); does not work as eval()ing this function fails */
  out += "\n};";
  out += "\n]]></" + EnDeRE.XML.tag + ">\n";
  return(out);
}; // template
this.parseInit= function(goab) {
  //#? special initialization for specified language
  /* goab - parser state, modified in place depending on goab.orig
   * to be called after initialization of goab{} !!
   */
  var flavour = goab.orig;
  var key = 0;
  if (flavour === ':Java-prop') {
    goab.init['"'] = this.O;
  } else if (flavour === ':VisualSt') {
    goab.ic = '';               // Todo add goab.desc.propVS to goab.desc.prop
  } else if (flavour === ':VB.NET') {
    for (key in goab.ctrl) { goab.ctrl[key] = this.O; }
  }
  // ToDo: more coming here ...
  /*
   * Notes on flavour specific behaviour (not yet implemented):
   :lex:
        {{egal}} denotes a variable named egal
        - in a [] class may be the first or the last character
        text enclosed in " or ' is a string literal
        / is a "lookahead" operator
   :elvis
        \@ matches word under cursor
        \= indicate where to put cursor after match
   :nvi
        can be switched from BRE to ERE with: set extended
   :vim, :vile, :elvis
        in extended mode () is used instead of \(\);
        allows \{n,m} instead of \{n,m\}
   :JavaScript, [\u0400-04ff] \u at the beginning only
        \b is an anchor, but backspace inside []
   :Java, :C#  \t becomes tab, \w results in an error
        Sun's java.util.regex knows Unicode for \b but not for \w
        \w, \d, \s matches only US-ASCII
        in free formating mode spaces are not allowed in character classes
   :C#  has strings with ".." and raw strings with @".."
   :VB.NET has strings with ".."; the only escape is " itself for a "
   :Perl has strings with ".." and '..'
        ".." \-escapes are converted, {..} is the result of the Perl code, $variables are expanded
        \Q and \E have a special meaning in ".."; \N is possible in ".." only
        m?..? is special, as it is possible only once
   :PHP has strings with ".." and '..'
        ".." \-escapes are converted, {..} is the result of the PHP code, $variables are expanded
        \t becomes tab, but \w remains \w
        '..' \ remains \ , hence only \ and ' itself need to be escaped
        cannot use named variable reference more than once
        word boundary shorthands work with ASCII only
        for preg_match* the RegEx is a string: preg_match_all('/(to|the|t.xt)/', input, $match)
   :Python knows ".." and '..', there is no difference
        but '''...''' allows newlines, r".." is a raw string *without escapes, i.e. \ remains*
   :Tcl see http://www.tcl.tk/man/tcl8.5/TclCmd/re_syntax.htm
        Within bracket expressions, \d, \s, and \w lose their outer brackets, and \D,
        \S, and \W are illegal. (So, for example, [a-c\d] is equivalent to
        [a-c[:digit:]]. Also, [a-c\D], which is equivalent to [a-c^[:digit:]], is illegal.)
        ***: and ***= are special inside the RegEx
   :Ruby
        uses POSIX (but claims PCRE)
        extensions:
        http://www.rubyfu.com/2007/06/named-captures-for-regular-expressions.html
   */
}; // parseInit
/*
var n='name';var x=new XML('<foo {n}="bar">heureca</foo>');alert(x.toXMLString()+'\n'+x.@name);
var n='name';var x=<foo {n}="bar">heureca</foo>;alert(x.toXMLString()+'\n'+x.@name);
var n='name';var x=<foo {n}="42"><bar id="5">ooh</bar><bar id="3">heureca</bar></foo>;alert(x.toXMLString()+'\n'+x[/bar@id="3"]);
*/
this.parse = function(src,lng,pre,cmt,raw) {
//#? regular expression parser
/* src - the text of the RegEx
* lng - language/flavour to analyze
* pre - pattern for prefix (may be empty)
* cmt - true if comment/description should be added
* raw - true if RegEx is raw data or prefixed/suffixed by other text
* (anything left of / or " is ignored)
*
* it should be possible to call this function multiple times simultaneously
*/
function delobj(arr) {
  //#? recursively remove all enumerable properties of arr (in place)
  /* arr - object or array to be emptied; any other value is ignored
   * returns nothing
   *
   * Only the content of arr can be released here: the caller's variable
   * itself cannot be reset or deleted from inside this function, the caller
   * has to do that if required.
   */
  var key   = '';
  var child = null;
  if (arr == undefined) { return; }         // matches null also
  if (typeof arr !== 'object') { return; }  // boolean, number, string, function
  for (key in arr) {
    child = arr[key];
    // detach first, then descend: cyclic references terminate this way
    arr[key] = null;
    delete arr[key];
    delobj(child);
  }
  // truncates arrays; on other objects this leaves a property length == 0
  arr.length = 0;
};
var bux = '';
var bbb = '';
var ccc = null;
var kkk = '';
var hex = '';
var h, k, m;
var goab = new EnDeTMP._goab();