Skip to content

Commit 2dfffdb

Browse files
authored
Merge pull request github#4590 from RasmusWL/python-model-base64
Python: Model encoding/decoding with base64 module
2 parents cb527ca + 247fd4f commit 2dfffdb

File tree

9 files changed

+287
-23
lines changed

9 files changed

+287
-23
lines changed

python/ql/src/experimental/semmle/python/Concepts.qll

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66

77
import python
88
private import experimental.dataflow.DataFlow
9-
private import experimental.semmle.python.Frameworks
109
private import experimental.dataflow.RemoteFlowSources
10+
private import experimental.dataflow.TaintTracking
11+
private import experimental.semmle.python.Frameworks
1112

1213
/**
1314
* A data-flow node that executes an operating system command,
@@ -113,8 +114,9 @@ module Path {
113114
* is intended to include deserialization, unmarshalling, decoding, unpickling,
114115
* decompressing, decrypting, parsing etc.
115116
*
116-
* Doing so should normally preserve taint, but it can also be a problem
117-
* in itself, e.g. if it allows code execution or could result in denial-of-service.
117+
* A decoding (automatically) preserves taint from input to output. However, it can
118+
* also be a problem in itself, for example if it allows code execution or could result
119+
* in denial-of-service.
118120
*
119121
* Extend this class to refine existing API models. If you want to model new APIs,
120122
* extend `Decoding::Range` instead.
@@ -144,8 +146,9 @@ module Decoding {
144146
* is intended to include deserialization, unmarshalling, decoding, unpickling,
145147
* decompressing, decrypting, parsing etc.
146148
*
147-
* Doing so should normally preserve taint, but it can also be a problem
148-
* in itself, e.g. if it allows code execution or could result in denial-of-service.
149+
* A decoding (automatically) preserves taint from input to output. However, it can
150+
* also be a problem in itself, for example if it allows code execution or could result
151+
* in denial-of-service.
149152
*
150153
* Extend this class to model new APIs. If you want to refine existing API models,
151154
* extend `Decoding` instead.
@@ -165,6 +168,73 @@ module Decoding {
165168
}
166169
}
167170

171+
private class DecodingAdditionalTaintStep extends TaintTracking::AdditionalTaintStep {
172+
override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
173+
exists(Decoding decoding |
174+
nodeFrom = decoding.getAnInput() and
175+
nodeTo = decoding.getOutput()
176+
)
177+
}
178+
}
179+
180+
/**
181+
* A data-flow node that encodes data to a binary or textual format. This
182+
* is intended to include serialization, marshalling, encoding, pickling,
183+
* compressing, encrypting, etc.
184+
*
185+
* An encoding (automatically) preserves taint from input to output.
186+
*
187+
* Extend this class to refine existing API models. If you want to model new APIs,
188+
* extend `Encoding::Range` instead.
189+
*/
190+
class Encoding extends DataFlow::Node {
191+
Encoding::Range range;
192+
193+
Encoding() { this = range }
194+
195+
/** Gets an input that is encoded by this function. */
196+
DataFlow::Node getAnInput() { result = range.getAnInput() }
197+
198+
/** Gets the output that contains the encoded data produced by this function. */
199+
DataFlow::Node getOutput() { result = range.getOutput() }
200+
201+
/** Gets an identifier for the format this function encodes to, such as "JSON". */
202+
string getFormat() { result = range.getFormat() }
203+
}
204+
205+
/** Provides a class for modeling new encoding mechanisms. */
206+
module Encoding {
207+
/**
208+
* A data-flow node that encodes data to a binary or textual format. This
209+
* is intended to include serialization, marshalling, encoding, pickling,
210+
* compressing, encrypting, etc.
211+
*
212+
* An encoding (automatically) preserves taint from input to output.
213+
*
214+
* Extend this class to model new APIs. If you want to refine existing API models,
215+
* extend `Encoding` instead.
216+
*/
217+
abstract class Range extends DataFlow::Node {
218+
/** Gets an input that is encoded by this function. */
219+
abstract DataFlow::Node getAnInput();
220+
221+
/** Gets the output that contains the encoded data produced by this function. */
222+
abstract DataFlow::Node getOutput();
223+
224+
/** Gets an identifier for the format this function encodes to, such as "JSON". */
225+
abstract string getFormat();
226+
}
227+
}
228+
229+
private class EncodingAdditionalTaintStep extends TaintTracking::AdditionalTaintStep {
230+
override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
231+
exists(Encoding encoding |
232+
nodeFrom = encoding.getAnInput() and
233+
nodeTo = encoding.getOutput()
234+
)
235+
}
236+
}
237+
168238
/**
169239
* A data-flow node that dynamically executes Python code.
170240
*

python/ql/src/experimental/semmle/python/frameworks/Stdlib.qll

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,131 @@ private class OpenCall extends FileSystemAccess::Range, DataFlow::CfgNode {
753753
}
754754
}
755755

756+
// ---------------------------------------------------------------------------
757+
// base64
758+
// ---------------------------------------------------------------------------
759+
/** Gets a reference to the `base64` module. */
760+
private DataFlow::Node base64(DataFlow::TypeTracker t) {
761+
t.start() and
762+
result = DataFlow::importNode("base64")
763+
or
764+
exists(DataFlow::TypeTracker t2 | result = base64(t2).track(t2, t))
765+
}
766+
767+
/** Gets a reference to the `base64` module. */
768+
DataFlow::Node base64() { result = base64(DataFlow::TypeTracker::end()) }
769+
770+
/**
771+
* Gets a reference to the attribute `attr_name` of the `base64` module.
772+
* WARNING: Only holds for a few predefined attributes.
773+
*/
774+
private DataFlow::Node base64_attr(DataFlow::TypeTracker t, string attr_name) {
775+
attr_name in ["b64encode", "b64decode", "standard_b64encode", "standard_b64decode",
776+
"urlsafe_b64encode", "urlsafe_b64decode", "b32encode", "b32decode", "b16encode",
777+
"b16decode", "encodestring", "decodestring", "a85encode", "a85decode", "b85encode",
778+
"b85decode", "encodebytes", "decodebytes"] and
779+
(
780+
t.start() and
781+
result = DataFlow::importNode("base64" + "." + attr_name)
782+
or
783+
t.startInAttr(attr_name) and
784+
result = base64()
785+
)
786+
or
787+
// Due to bad performance when using normal setup with `base64_attr(t2, attr_name).track(t2, t)`
788+
// we have inlined that code and forced a join
789+
exists(DataFlow::TypeTracker t2 |
790+
exists(DataFlow::StepSummary summary |
791+
base64_attr_first_join(t2, attr_name, result, summary) and
792+
t = t2.append(summary)
793+
)
794+
)
795+
}
796+
797+
pragma[nomagic]
798+
private predicate base64_attr_first_join(
799+
DataFlow::TypeTracker t2, string attr_name, DataFlow::Node res, DataFlow::StepSummary summary
800+
) {
801+
DataFlow::StepSummary::step(base64_attr(t2, attr_name), res, summary)
802+
}
803+
804+
/**
805+
* Gets a reference to the attribute `attr_name` of the `base64` module.
806+
* WARNING: Only holds for a few predefined attributes.
807+
*/
808+
private DataFlow::Node base64_attr(string attr_name) {
809+
result = base64_attr(DataFlow::TypeTracker::end(), attr_name)
810+
}
811+
812+
/** A call to any of the encode functions in the `base64` module. */
813+
private class Base64EncodeCall extends Encoding::Range, DataFlow::CfgNode {
814+
override CallNode node;
815+
816+
Base64EncodeCall() {
817+
exists(string name |
818+
name in ["b64encode", "standard_b64encode", "urlsafe_b64encode", "b32encode", "b16encode",
819+
"encodestring", "a85encode", "b85encode", "encodebytes"] and
820+
node.getFunction() = base64_attr(name).asCfgNode()
821+
)
822+
}
823+
824+
override DataFlow::Node getAnInput() { result.asCfgNode() = node.getArg(0) }
825+
826+
override DataFlow::Node getOutput() { result = this }
827+
828+
override string getFormat() {
829+
exists(string name | node.getFunction() = base64_attr(name).asCfgNode() |
830+
name in ["b64encode", "standard_b64encode", "urlsafe_b64encode", "encodestring", "encodebytes"] and
831+
result = "Base64"
832+
or
833+
name = "b32encode" and result = "Base32"
834+
or
835+
name = "b16encode" and result = "Base16"
836+
or
837+
name = "a85encode" and result = "Ascii85"
838+
or
839+
name = "b85encode" and result = "Base85"
840+
)
841+
}
842+
}
843+
844+
/** A call to any of the decode functions in the `base64` module. */
845+
private class Base64DecodeCall extends Decoding::Range, DataFlow::CfgNode {
846+
override CallNode node;
847+
848+
Base64DecodeCall() {
849+
exists(string name |
850+
name in ["b64decode", "standard_b64decode", "urlsafe_b64decode", "b32decode", "b16decode",
851+
"decodestring", "a85decode", "b85decode", "decodebytes"] and
852+
node.getFunction() = base64_attr(name).asCfgNode()
853+
)
854+
}
855+
856+
override predicate mayExecuteInput() { none() }
857+
858+
override DataFlow::Node getAnInput() { result.asCfgNode() = node.getArg(0) }
859+
860+
override DataFlow::Node getOutput() { result = this }
861+
862+
override string getFormat() {
863+
exists(string name | node.getFunction() = base64_attr(name).asCfgNode() |
864+
name in ["b64decode", "standard_b64decode", "urlsafe_b64decode", "decodestring", "decodebytes"] and
865+
result = "Base64"
866+
or
867+
name = "b32decode" and result = "Base32"
868+
or
869+
name = "b16decode" and result = "Base16"
870+
or
871+
name = "a85decode" and result = "Ascii85"
872+
or
873+
name = "b85decode" and result = "Base85"
874+
)
875+
}
876+
}
877+
878+
// ---------------------------------------------------------------------------
879+
// OTHER
880+
// ---------------------------------------------------------------------------
756881
/**
757882
* A call to the `startswith` method on a string.
758883
* See https://docs.python.org/3.9/library/stdtypes.html#str.startswith

python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep-py3/TestTaint.expected

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@
1616
| test_string.py:17 | ok | str_methods | ts.casefold() |
1717
| test_string.py:19 | ok | str_methods | ts.format_map(..) |
1818
| test_string.py:20 | ok | str_methods | "{unsafe}".format_map(..) |
19-
| test_string.py:31 | fail | binary_decode_encode | base64.a85encode(..) |
20-
| test_string.py:32 | fail | binary_decode_encode | base64.a85decode(..) |
21-
| test_string.py:35 | fail | binary_decode_encode | base64.b85encode(..) |
22-
| test_string.py:36 | fail | binary_decode_encode | base64.b85decode(..) |
23-
| test_string.py:39 | fail | binary_decode_encode | base64.encodebytes(..) |
24-
| test_string.py:40 | fail | binary_decode_encode | base64.decodebytes(..) |
19+
| test_string.py:31 | ok | binary_decode_encode | base64.a85encode(..) |
20+
| test_string.py:32 | ok | binary_decode_encode | base64.a85decode(..) |
21+
| test_string.py:35 | ok | binary_decode_encode | base64.b85encode(..) |
22+
| test_string.py:36 | ok | binary_decode_encode | base64.b85decode(..) |
23+
| test_string.py:39 | ok | binary_decode_encode | base64.encodebytes(..) |
24+
| test_string.py:40 | ok | binary_decode_encode | base64.decodebytes(..) |
2525
| test_string.py:48 | ok | f_strings | Fstring |
2626
| test_unpacking.py:18 | ok | extended_unpacking | first |
2727
| test_unpacking.py:18 | ok | extended_unpacking | last |

python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep/TestTaint.expected

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -123,18 +123,18 @@
123123
| test_string.py:114 | ok | percent_fmt | BinaryExpr |
124124
| test_string.py:115 | ok | percent_fmt | BinaryExpr |
125125
| test_string.py:116 | ok | percent_fmt | BinaryExpr |
126-
| test_string.py:126 | fail | binary_decode_encode | base64.b64encode(..) |
127-
| test_string.py:127 | fail | binary_decode_encode | base64.b64decode(..) |
128-
| test_string.py:129 | fail | binary_decode_encode | base64.standard_b64encode(..) |
129-
| test_string.py:130 | fail | binary_decode_encode | base64.standard_b64decode(..) |
130-
| test_string.py:132 | fail | binary_decode_encode | base64.urlsafe_b64encode(..) |
131-
| test_string.py:133 | fail | binary_decode_encode | base64.urlsafe_b64decode(..) |
132-
| test_string.py:135 | fail | binary_decode_encode | base64.b32encode(..) |
133-
| test_string.py:136 | fail | binary_decode_encode | base64.b32decode(..) |
134-
| test_string.py:138 | fail | binary_decode_encode | base64.b16encode(..) |
135-
| test_string.py:139 | fail | binary_decode_encode | base64.b16decode(..) |
136-
| test_string.py:142 | fail | binary_decode_encode | base64.encodestring(..) |
137-
| test_string.py:143 | fail | binary_decode_encode | base64.decodestring(..) |
126+
| test_string.py:126 | ok | binary_decode_encode | base64.b64encode(..) |
127+
| test_string.py:127 | ok | binary_decode_encode | base64.b64decode(..) |
128+
| test_string.py:129 | ok | binary_decode_encode | base64.standard_b64encode(..) |
129+
| test_string.py:130 | ok | binary_decode_encode | base64.standard_b64decode(..) |
130+
| test_string.py:132 | ok | binary_decode_encode | base64.urlsafe_b64encode(..) |
131+
| test_string.py:133 | ok | binary_decode_encode | base64.urlsafe_b64decode(..) |
132+
| test_string.py:135 | ok | binary_decode_encode | base64.b32encode(..) |
133+
| test_string.py:136 | ok | binary_decode_encode | base64.b32decode(..) |
134+
| test_string.py:138 | ok | binary_decode_encode | base64.b16encode(..) |
135+
| test_string.py:139 | ok | binary_decode_encode | base64.b16decode(..) |
136+
| test_string.py:142 | ok | binary_decode_encode | base64.encodestring(..) |
137+
| test_string.py:143 | ok | binary_decode_encode | base64.decodestring(..) |
138138
| test_string.py:148 | fail | binary_decode_encode | quopri.encodestring(..) |
139139
| test_string.py:149 | fail | binary_decode_encode | quopri.decodestring(..) |
140140
| test_string.py:159 | ok | test_os_path_join | os.path.join(..) |
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import base64
2+
3+
# TODO: These tests should be merged with python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep-py3/test_string.py
4+
base64.a85decode(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=Ascii85
5+
base64.b85decode(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=Base85
6+
base64.decodebytes(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=Base64
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import base64
2+
3+
# TODO: These tests should be merged with python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep-py3/test_string.py
4+
base64.a85encode(bs) # $encodeInput=bs $encodeOutput=Attribute() $encodeFormat=Ascii85
5+
base64.b85encode(bs)# $encodeInput=bs $encodeOutput=Attribute() $encodeFormat=Base85
6+
base64.encodebytes(bs)# $encodeInput=bs $encodeOutput=Attribute() $encodeFormat=Base64
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
import pickle
22
import marshal
3+
import base64
34

45
pickle.loads(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=pickle $decodeMayExecuteInput
56
marshal.loads(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=marshal $decodeMayExecuteInput
7+
8+
# TODO: These tests should be merged with python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep/test_string.py
9+
base64.b64decode(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=Base64
10+
base64.standard_b64decode(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=Base64
11+
base64.urlsafe_b64decode(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=Base64
12+
base64.b32decode(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=Base32
13+
base64.b16decode(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=Base16
14+
# deprecated since Python 3.1, but still works
15+
base64.decodestring(payload) # $decodeInput=payload $decodeOutput=Attribute() $decodeFormat=Base64
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import pickle
2+
import marshal
3+
import base64
4+
5+
pickle.dumps(obj) # $f-:encodeInput=obj $f-:encodeOutput=Attribute() $f-:encodeFormat=pickle $f-:encodeMayExecuteInput
6+
marshal.dumps(obj) # $f-:encodeInput=obj $f-:encodeOutput=Attribute() $f-:encodeFormat=marshal $f-:encodeMayExecuteInput
7+
8+
# TODO: These tests should be merged with python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep/test_string.py
9+
base64.b64encode(bs) # $encodeInput=bs $encodeOutput=Attribute() $encodeFormat=Base64
10+
base64.standard_b64encode(bs) # $encodeInput=bs $encodeOutput=Attribute() $encodeFormat=Base64
11+
base64.urlsafe_b64encode(bs) # $encodeInput=bs $encodeOutput=Attribute() $encodeFormat=Base64
12+
base64.b32encode(bs) # $encodeInput=bs $encodeOutput=Attribute() $encodeFormat=Base32
13+
base64.b16encode(bs) # $encodeInput=bs $encodeOutput=Attribute() $encodeFormat=Base16
14+
# deprecated since Python 3.1, but still works
15+
base64.encodestring(bs) # $encodeInput=bs $encodeOutput=Attribute() $encodeFormat=Base64

python/ql/test/experimental/meta/ConceptsTest.qll

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,38 @@ class DecodingTest extends InlineExpectationsTest {
7373
}
7474
}
7575

76+
class EncodingTest extends InlineExpectationsTest {
77+
EncodingTest() { this = "EncodingTest" }
78+
79+
override string getARelevantTag() { result in ["encodeInput", "encodeOutput", "encodeFormat"] }
80+
81+
override predicate hasActualResult(Location ___location, string element, string tag, string value) {
82+
exists(___location.getFile().getRelativePath()) and
83+
exists(Encoding e |
84+
exists(DataFlow::Node data |
85+
___location = data.getLocation() and
86+
element = data.toString() and
87+
value = value_from_expr(data.asExpr()) and
88+
(
89+
data = e.getAnInput() and
90+
tag = "encodeInput"
91+
or
92+
data = e.getOutput() and
93+
tag = "encodeOutput"
94+
)
95+
)
96+
or
97+
exists(string format |
98+
___location = e.getLocation() and
99+
element = format and
100+
value = format and
101+
format = e.getFormat() and
102+
tag = "encodeFormat"
103+
)
104+
)
105+
}
106+
}
107+
76108
class CodeExecutionTest extends InlineExpectationsTest {
77109
CodeExecutionTest() { this = "CodeExecutionTest" }
78110

0 commit comments

Comments
 (0)