regex exercise

sksehdev · sksehdev · commit 18f7512d7d4c · 2019-11-09T20:29:06.000-05:00
diff --git a/days/28-30-regex/Bite2_Regex.py b/days/28-30-regex/Bite2_Regex.py
@@ -0,0 +1,57 @@
+import re
+
+# Sample inputs; each one is the default argument of a function below.
+
+# Course outline. The adjacent literals are concatenated with no separator,
+# so every mm:ss timing is immediately followed by the next section title.
+COURSE = ('Introduction 1 Lecture 01:47'
+          'The Basics 4 Lectures 32:03'
+          'Getting Technical!  4 Lectures 41:51'
+          'Challenge 2 Lectures 27:48'
+          'Afterword 1 Lecture 05:02')
+# Tweet text containing one link and two hashtags (pieces end in a space,
+# so the concatenated text keeps its word boundaries).
+TWEET = ('New PyBites article: Module of the Week - Requests-cache '
+         'for Repeated API Calls - http://pybit.es/requests-cache.html '
+         '#python #APIs')
+# Two back-to-back <p> elements, useful for checking non-greedy matching.
+HTML = ('<p>pybites != greedy</p>'
+        '<p>not the same can be said REgarding ...</p>')
+
+
+def extract_course_times(course=COURSE):
+    """Return the course timings from the passed in
+       course string. Timings are in mm:ss (minutes:seconds)
+       format, so taking COURSE above you would extract:
+       ['01:47', '32:03', '41:51', '27:48', '05:02']
+       Return this list.
+    """
+    reg1 = re.compile(r'\d\d:\d\d')
+    return reg1.findall(course)
+
+    pass
+
+
+def get_all_hashtags_and_links(tweet=TWEET):
+    """Get all hashtags and links from the tweet text
+       that is passed into this function. So for TWEET
+       above you need to extract the following list:
+       ['http://pybit.es/requests-cache.html',
+        '#python',
+        '#APIs']
+       Return this list.
+    """
+    reg1 = re.compile(r'(?:#|http)\S+')
+    return reg1.findall(tweet)
+
+    pass
+
+
+def match_first_paragraph(html=HTML):
+    """Extract the first paragraph of the passed in
+       html, so for HTML above this would be:
+       'pybites != greedy' (= content of first paragraph).
+       Return this string.
+    """
+    # reg1 = re.compile(r'<p>(.*?)</p>')
+    # return reg1.findall(html)[0]
+    print(re.sub(r'^<p>(.*?)</p>.*$', r" \1", html))
+    print(html)
+    pass
+
+extract_course_times()
+get_all_hashtags_and_links()
+match_first_paragraph()