Download presentation
Presentation is loading. Please wait.
Published by Arthur Allen. Modified over 9 years ago.
2
Fast and Precise Sanitizer Analysis with BEK. Pieter Hooimeijer, Ben Livshits, David Molnar, Prateek Saxena, Margus Veanes. 2011-08-10, USENIX Security
3
3
4
4
5
5 QUESTION: What could possibly go wrong?
6
6 Attacker: im.png' onload='javascript:...
7
7 Attacker: im.png' onload='javascript:...
8
8 Attacker: im.png' onload='javascript:... Result: <img src='im.png' onload='javascri
9
9 Attacker: im.png' onload='javascript:... Result: <img src='im.png' onload='javascri FAIL
10
10
11
11 A tale of two sanitizers…
12
12 ' &#39; single quote html entity
13
13 some untrusted input
14
14 Library A Name: Around for: Availability: HtmlEncode Years Readily available to C# developers some untrusted input
15
15 Library A Name: Around for: Availability: Library B Name: Around for: Availability: HtmlEncode Years Readily available to C# developers HtmlEncode Years Readily available to C# developers some untrusted input
16
16 Library A Name: Around for: Availability: Library B Name: Around for: Availability: HtmlEncode Years Readily available to C# developers HtmlEncode Years Readily available to C# developers ' ' ' ' ✔ ✘
17
17 public static string HtmlEncode(string s) { if (s == null) return null; int num = IndexOfHtmlEncodingChars(s, 0); if (num == -1) return s; StringBuilder builder=new StringBuilder(s.Length+5); int length = s.Length; int startIndex = 0; Label_002A: if (num > startIndex) { builder.Append(s, startIndex, num-startIndex); } char ch = s[num]; if (ch > '>') { builder.Append("&#"); builder.Append(((int) ch).ToString(NumberFormatInfo.InvariantInfo)); builder.Append(';'); } else { char ch2 = ch; if (ch2 != '"') { switch (ch2) { case '<': builder.Append("<"); goto Label_00D5; case '=': goto Label_00D5; case '>': builder.Append(">"); goto Label_00D5; case '&': builder.Append("&"); goto Label_00D5; } else { builder.Append("""); } Label_00D5: startIndex = num + 1; if (startIndex < length) { num = IndexOfHtmlEncodingChars(s, startIndex); if (num != -1) { goto Label_002A; } builder.Append(s, startIndex, length-startIndex); } return builder.ToString(); }.NET WebUtility MS AntiXSS private static string HtmlEncode(string input, bool useNamedEntities, MethodSpecificEncoder encoderTweak) { if (string.IsNullOrEmpty(input)) { return input; } if (characterValues == null) { InitialiseSafeList(); } if (useNamedEntities && namedEntities == null) { InitialiseNamedEntityList(); } // Setup a new character array for output. 
char[] inputAsArray = input.ToCharArray(); int outputLength = 0; int inputLength = inputAsArray.Length; char[] encodedInput = new char[inputLength * 10]; SyncLock.EnterReadLock(); try { for (int i = 0; i < inputLength; i++) { char currentCharacter = inputAsArray[i]; int currentCodePoint = inputAsArray[i]; char[] tweekedValue; // Check for invalid values if (currentCodePoint == 0xFFFE || currentCodePoint == 0xFFFF) { throw new InvalidUnicodeValueException(currentCodePoint); } else if (char.IsHighSurrogate(currentCharacter)) { if (i + 1 == inputLength) { throw new InvalidSurrogatePairException(currentCharacter, '\0'); } // Now peak ahead and check if the following character is a low surrogate. char nextCharacter = inputAsArray[i + 1]; char nextCodePoint = inputAsArray[i + 1]; if (!char.IsLowSurrogate(nextCharacter)) { throw new InvalidSurrogatePairException(currentCharacter, nextCharacter); } // Look-ahead was good, so skip. i++; // Calculate the combined code point long combinedCodePoint = 0x10000 + ((currentCodePoint - 0xD800) * 0x400) + (nextCodePoint - 0xDC00); char[] encodedCharacter = SafeList.HashThenValueGenerator(combinedCodePoint); encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else if (char.IsLowSurrogate(currentCharacter)) { throw new InvalidSurrogatePairException('\0', currentCharacter); } else if (encoderTweak != null && encoderTweak(currentCharacter, out tweekedValue)) { for (int j = 0; j < tweekedValue.Length; j++) { encodedInput[outputLength++] = tweekedValue[j]; } else if (useNamedEntities && namedEntities[currentCodePoint] != null) { char[] encodedCharacter = namedEntities[currentCodePoint]; encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else if (characterValues[currentCodePoint] != 
null) { // character needs to be encoded char[] encodedCharacter = characterValues[currentCodePoint]; encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else { // character does not need encoding encodedInput[outputLength++] = currentCharacter; } finally { SyncLock.ExitReadLock(); } return new string(encodedInput, 0, outputLength); }
18
private static string HtmlEncode(string input, bool useNamedEntities, MethodSpecificEncoder encoderTweak) { if (string.IsNullOrEmpty(input)) { return input; } if (characterValues == null) { InitialiseSafeList(); } if (useNamedEntities && namedEntities == null) { InitialiseNamedEntityList(); } // Setup a new character array for output. char[] inputAsArray = input.ToCharArray(); int outputLength = 0; int inputLength = inputAsArray.Length; char[] encodedInput = new char[inputLength * 10]; SyncLock.EnterReadLock(); try { for (int i = 0; i < inputLength; i++) { char currentCharacter = inputAsArray[i]; int currentCodePoint = inputAsArray[i]; char[] tweekedValue; // Check for invalid values if (currentCodePoint == 0xFFFE || currentCodePoint == 0xFFFF) { throw new InvalidUnicodeValueException(currentCodePoint); } else if (char.IsHighSurrogate(currentCharacter)) { if (i + 1 == inputLength) { throw new InvalidSurrogatePairException(currentCharacter, '\0'); } // Now peak ahead and check if the following character is a low surrogate. char nextCharacter = inputAsArray[i + 1]; char nextCodePoint = inputAsArray[i + 1]; if (!char.IsLowSurrogate(nextCharacter)) { throw new InvalidSurrogatePairException(currentCharacter, nextCharacter); } // Look-ahead was good, so skip. 
i++; // Calculate the combined code point long combinedCodePoint = 0x10000 + ((currentCodePoint - 0xD800) * 0x400) + (nextCodePoint - 0xDC00); char[] encodedCharacter = SafeList.HashThenValueGenerator(combinedCodePoint); encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else if (char.IsLowSurrogate(currentCharacter)) { throw new InvalidSurrogatePairException('\0', currentCharacter); } else if (encoderTweak != null && encoderTweak(currentCharacter, out tweekedValue)) { for (int j = 0; j < tweekedValue.Length; j++) { encodedInput[outputLength++] = tweekedValue[j]; } else if (useNamedEntities && namedEntities[currentCodePoint] != null) { char[] encodedCharacter = namedEntities[currentCodePoint]; encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else if (characterValues[currentCodePoint] != null) { // character needs to be encoded char[] encodedCharacter = characterValues[currentCodePoint]; encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else { // character does not need encoding encodedInput[outputLength++] = currentCharacter; } finally { SyncLock.ExitReadLock(); } return new string(encodedInput, 0, outputLength); } public static string HtmlEncode(string s) { if (s == null) return null; int num = IndexOfHtmlEncodingChars(s, 0); if (num == -1) return s; StringBuilder builder=new StringBuilder(s.Length+5); int length = s.Length; int startIndex = 0; Label_002A: if (num > startIndex) { builder.Append(s, startIndex, num-startIndex); } char ch = s[num]; if (ch > '>') { builder.Append("&#"); builder.Append(((int) ch).ToString(NumberFormatInfo.InvariantInfo)); builder.Append(';'); } else { 
char ch2 = ch; if (ch2 != '"') { switch (ch2) { case '<': builder.Append("&lt;"); goto Label_00D5; case '=': goto Label_00D5; case '>': builder.Append("&gt;"); goto Label_00D5; case '&': builder.Append("&amp;"); goto Label_00D5; } else { builder.Append("&quot;"); } Label_00D5: startIndex = num + 1; if (startIndex < length) { num = IndexOfHtmlEncodingChars(s, startIndex); if (num != -1) { goto Label_002A; } builder.Append(s, startIndex, length-startIndex); } return builder.ToString(); } 18 .NET WebUtility MS AntiXSS Same behavior on all inputs? If not, what is a differentiating input? Can it generate any known ‘bad’ outputs?
19
19 A tale of 151 sanitizers…
20
20 PHP Trunk Changes to html.c, 1999—2011
21
21 PHP Trunk Changes to html.c, 1999—2011 R7,841 April 1999 135 loc R309,482 March 2011 1693 loc
22
22 PHP Trunk Changes to html.c, 1999—2011 R32,564 September 2000 ENT_QUOTES introduced R7,841 April 1999 135 loc R309,482 March 2011 1693 loc
23
23 PHP Trunk Changes to html.c, 1999—2011 R32,564 September 2000 ENT_QUOTES introduced R242,949 September 2007 $double_encode=true R7,841 April 1999 135 loc R309,482 March 2011 1693 loc
24
24 PHP Trunk Changes to html.c, 1999—2011 Safe to apply twice? Safe to combine with other sanitizers?
25
Motivation 25 Writing string sanitizers correctly is difficult There is no cheap way to identify problems with sanitizers ‘Correctness’ is a moving target What if we could say more about sanitizer behavior?
26
26 BEK Frontend: a small language for string manipulation; similar to how sanitizers are written today Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation BEK Frontend: a small language for string manipulation; similar to how sanitizers are written today Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation Contributions
27
27 B EK Frontend: a small language for string manipulation; similar to how sanitizers are written today Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation B EK Frontend: a small language for string manipulation; similar to how sanitizers are written today Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation Contributions Evaluation Converted sanitizers from a variety of sources Checked properties like reversibility, idempotence, equivalence, and commutativity Evaluation Converted sanitizers from a variety of sources Checked properties like reversibility, idempotence, equivalence, and commutativity
28
28 B EK Frontend: a small language for string manipulation; similar to how sanitizers are written today Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation B EK Frontend: a small language for string manipulation; similar to how sanitizers are written today Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation Contributions Evaluation Converted sanitizers from a variety of sources Checked properties like reversibility, idempotence, equivalence, and commutativity Evaluation Converted sanitizers from a variety of sources Checked properties like reversibility, idempotence, equivalence, and commutativity
29
29 s := iter(c in t)[b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Bek Program BEK: Architecture
30
30 Symbolic Finite Transducers Z3 Transformation Microsoft.Automata s := iter(c in t)[b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Bek Program B EK : Architecture
31
31 Symbolic Finite Transducers Z3 Transformation Analysis Does it do the right thing? Counterexample “\' vs. \\'” Microsoft.Automata s := iter(c in t)[b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Bek Program B EK : Architecture
32
32 Symbolic Finite Transducers Z3 Transformation Analysis Does it do the right thing? Counterexample “\' vs. \\'” Microsoft.Automata s := iter(c in t)[b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Bek Program Code Gen C# JavaScript C Code Gen BEK: Architecture
33
33 Symbolic Finite Transducers Z3 Transformation Analysis Does it do the right thing? Counterexample “\' vs. \\'” Microsoft.Automata s := iter(c in t)[b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Bek Program Code Gen C#JavaScriptC Code Gen B EK : Architecture
34
34 t := iter(c in s)[b := false;] { case (!b && c in "['\"]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; A BEK Program: Escape Quotes
35
35 t := iter(c in s)[b := false;] { case (!b && c in "['\"]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; A B EK Program: Escape Quotes iterate over the characters in string s
36
A B EK Program: Escape Quotes 36 t := iter(c in s)[b := false;] { case (!b && c in "['\"]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; iterate over the characters in string s while updating one boolean variable b
37
37 Symbolic Finite Transducers Z3 Transformation Analysis Does it do the right thing? Counterexample “\' vs. \\'” Microsoft.Automata s := iter(c in t)[b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Bek Program Code Gen C#JavaScriptC Code Gen B EK : Architecture
38
38 A Symbolic Finite Transducer
39
39 A Symbolic Finite Transducer symbolic predicates
40
40 output lists A Symbolic Finite Transducer symbolic predicates
41
41 Symbolic Finite Transducers Z3 Transformation Analysis Does it do the right thing? Counterexample “\' vs. \\'” Microsoft.Automata s := iter(c in t)[b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Bek Program Code Gen C#JavaScriptC Code Gen B EK : Architecture
42
42 Symbolic Finite Transducers Z3 Transformation Analysis Does it do the right thing? Counterexample “\' vs. \\'” Microsoft.Automata s := iter(c in t)[b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Bek Program Code Gen C#JavaScriptC Code Gen B EK : Architecture Now what?
43
SFT Algorithms 43 Equivalence Checking
44
SFT Algorithms 44 Equivalence Checking
45
SFT Algorithms 45 Join Composition SFT A B inout SFT A inout SFT B
46
SFT Algorithms 46 Join Composition SFT A B inout SFT A inout SFT B
47
47 Pre-Image Computation in SFT A Regular Language S
48
48 Pre-Image Computation in SFT A Regular Language S ?
49
49 B EK Frontend: a small language for string manipulation; similar to how sanitizers are written today Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation B EK Frontend: a small language for string manipulation; similar to how sanitizers are written today Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation Contributions Evaluation Converted sanitizers from a variety of sources Checked properties like reversibility, idempotence, equivalence, and commutativity Evaluation Converted sanitizers from a variety of sources Checked properties like reversibility, idempotence, equivalence, and commutativity
50
50 Some Questions What features are needed to port existing sanitizers? Can we check interesting properties on real sanitizers? Will HtmlEnc implementations protect against XSS Cheat Sheet samples?
51
Language Features 51 Data: 1x OWASP esapi HTMLencode 13x Google Ctemplate AutoEscape 21x IE 8 XSS Filter 7x Synthetic inspect feature counts What features are needed to port existing sanitizers?
52
Language Features 52 What features are needed to port existing sanitizers? Majority (76%) of sanitizers can be ported without extending the language With multi-character lookahead: 90%
53
53 Data 4x MS internal HtmlEncode 3x ‘for hire’ HtmlEncode based on English-language specification (C#) Commutative? Equivalent? Can we check interesting properties on real sanitizers?
54
54 Can we check interesting properties on real sanitizers? Short answer: Yes!
55
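55 Short answer: Yes! EQ results take less than a minute to obtain (pairwise equivalence of implementations 1–7, upper triangle only; row i lists columns i through 7): row 1: ✔ ✔ ✔ ✘ ✘ ✔ ✘; row 2: ✔ ✔ ✘ ✘ ✔ ✘; row 3: ✔ ✘ ✘ ✔ ✘; row 4: ✔ ✘ ✘ ✘; row 5: ✔ ✘ ✘; row 6: ✔ ✘; row 7: ✔. Can we check interesting properties on real sanitizers?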
56
The Cheat Sheet 56 Will HtmlEnc protect against known XSS strings? in SFT A Regular Language S ?
57
The Cheat Sheet 57 Will HtmlEnc protect against known XSS strings? One out of seven implementations correctly encodes all strings for use in both HTML and attribute contexts
58
58 BEK is a domain-specific language for writing string sanitizers. We model BEK programs without approximation using symbolic finite transducers, enabling, e.g., equivalence checks. We evaluate our system using real-world sanitizers from a variety of different sources. Conclusion
59
Thanks! http://research.microsoft.com/en-us/projects/bek/ http://www.rise4fun.com/bek/
60
Demo Time
61
61 Randomly-generated BEK programs, parameterized on SFT size. Commutative? Equivalent? Scalability: Approach
62
62 Commutativity, Self-Equivalence. Scalability: Results
63
63 100 PHP projects scrape 9.6 million lines of PHP static count usage stats for 111 distinct PHP library functions Sanitizer use in PHP code: Approach
64
64 Sanitizer use in PHP code: Results
Similar presentations
© 2025 SlidePlayer.com. Inc.
All rights reserved.