Regular Expressions
Add these crates to your own project:
cargo add lazy_static regex stringreader
Verify and extract login from an email address
Validates that an email address is formatted correctly, and extracts everything before the @ symbol.
use lazy_static::lazy_static; use regex::Regex; fn extract_login(input: &str) -> Option<&str> { lazy_static! { static ref RE: Regex = Regex::new(r"(?x) ^(?P<login>[^@\s]+)@ ([[:word:]]+\.)* [[:word:]]+$ ").unwrap(); } RE.captures(input).and_then(|cap| { cap.name("login").map(|login| login.as_str()) }) } fn main() { assert_eq!(extract_login(r"I❤email@example.com"), Some(r"I❤email")); assert_eq!( extract_login(r"sdf+sdsfsd.as.sdsd@jhkk.d.rl"), Some(r"sdf+sdsfsd.as.sdsd") ); assert_eq!(extract_login(r"More@Than@One@at.com"), None); assert_eq!(extract_login(r"Not an email@email"), None); }
Extract a list of unique #Hashtags from a text
Extracts and deduplicates the hashtags found in a text. The result is a HashSet, so the hashtags are unique but not sorted.
The hashtag regex given here only catches Latin hashtags that start with a letter. The complete twitter hashtag regex is much more complicated.
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashSet;

/// Collects every distinct hashtag in `text`.
///
/// A hashtag here is `#` followed by an ASCII letter and then any number of
/// ASCII letters, digits or underscores. The returned slices borrow from `text`.
fn extract_hashtags(text: &str) -> HashSet<&str> {
    lazy_static! {
        static ref HASHTAG_REGEX: Regex = Regex::new(r"\#[a-zA-Z][0-9a-zA-Z_]*").unwrap();
    }

    let mut unique_tags = HashSet::new();
    for hit in HASHTAG_REGEX.find_iter(text) {
        // Inserting into the set is what removes duplicates such as a repeated `#dog`.
        unique_tags.insert(hit.as_str());
    }
    unique_tags
}

fn main() {
    let tweet = "Hey #world, I just got my new #dog, say hello to Till. #dog #forever #2 #_ ";
    let tags = extract_hashtags(tweet);

    assert!(tags.contains("#dog") && tags.contains("#forever") && tags.contains("#world"));
    // `#2` and `#_` do not start with a letter, and `#dog` is only counted once.
    assert_eq!(tags.len(), 3);
    println!("{:?}", tags);
}
Extract phone numbers from text
Processes a string of text using Regex::captures_iter
to capture multiple
phone numbers. The example here is for US convention phone numbers.
use regex::Regex;
use std::error::Error;
use std::fmt;

/// The three components of a US phone number, borrowed from the searched text.
struct PhoneNumber<'a> {
    area: &'a str,
    exchange: &'a str,
    subscriber: &'a str,
}

impl<'a> fmt::Display for PhoneNumber<'a> {
    /// Writes the number as `1 (AREA) EXCHANGE-SUBSCRIBER`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "1 ({}) {}-{}", self.area, self.exchange, self.subscriber)
    }
}

fn main() -> Result<(), Box<dyn Error>> {
    let phone_text = " +1 505 881 9292 (v) +1 505 778 2212 (c) +1 505 881 9297 (f) (202) 991 9534 Alex 5553920011 1 (800) 233-2010 1.299.339.1020";

    // The pattern uses verbose mode (`(?x)`), where `#` starts a comment that
    // runs to the end of the line. Each component therefore has to sit on its
    // own line; on a single line the first comment would swallow the rest of
    // the pattern.
    let re = Regex::new(
        r#"(?x)
          (?:\+?1)?                       # Country Code Optional
          [\s\.]?
          (([2-9]\d{2})|\(([2-9]\d{2})\)) # Area Code
          [\s\.\-]?
          ([2-9]\d{2})                    # Exchange Code
          [\s\.\-]?
          (\d{4})                         # Subscriber Number"#,
    )?;

    let phone_numbers = re.captures_iter(phone_text).filter_map(|cap| {
        // Group 2 is the bare area code, group 3 the parenthesised form;
        // only one of them participates in any given match.
        let area = cap.get(2).or_else(|| cap.get(3))?;
        let exchange = cap.get(4)?;
        let subscriber = cap.get(5)?;
        Some(PhoneNumber {
            area: area.as_str(),
            exchange: exchange.as_str(),
            subscriber: subscriber.as_str(),
        })
    });

    assert_eq!(
        phone_numbers.map(|m| m.to_string()).collect::<Vec<_>>(),
        vec![
            "1 (505) 881-9292",
            "1 (505) 778-2212",
            "1 (505) 881-9297",
            "1 (202) 991-9534",
            "1 (555) 392-0011",
            "1 (800) 233-2010",
            "1 (299) 339-1020",
        ]
    );

    Ok(())
}
Filter a log file by matching multiple regular expressions
Reads file contents (stored as a string in the program itself) and only outputs the lines containing “version X.X.X”, some IP address followed by port 443 (e.g. “192.168.0.1:443”), or a specific warning.
The example can be modified to read from an actual file.
A regex::RegexSetBuilder composes a regex::RegexSet.
Since backslashes are very common in regular expressions, using
raw string literals makes them more readable.
use std::io::{BufReader, BufRead}; use regex::RegexSetBuilder; use std::error::Error; use stringreader::StringReader; fn main() -> Result<(), Box<dyn Error>> { let simulated_file_contents = r#" WARNING: Timeout Expired! 192.168.0.1:443 https://10.18.77.106:443 version "1.2.3" "#; let str_reader = StringReader::new(simulated_file_contents); let buffered = BufReader::new(str_reader); // // To filter from a file on the filesystem instead: // // use std::fs::File; // let buffered = BufReader::new(File::open("my_logfile.txt)?); // let set = RegexSetBuilder::new(&[ r#"version "\d\.\d\.\d""#, r#"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:443"#, r#"warning.*timeout expired"#, ]).case_insensitive(true) .build()?; buffered .lines() .filter_map(|line| line.ok()) .filter(|line| set.is_match(line.as_str())) .for_each(|x| println!("{}", x)); Ok(()) }
Replace all occurrences of one text pattern with another pattern.
Replaces all occurrences of the American English date pattern
MM/DD/YYYY with the equivalent ISO 8601 standard YYYY-MM-DD date format.
For example, 01/15/2013 becomes 2013-01-15.
The method Regex::replace_all
replaces all occurrences of the whole regex.
&str
implements the Replacer
trait which allows variables like $abcde
to
refer to corresponding named capture groups (?P<abcde>REGEX)
from the search
regex. See the replacement string syntax for examples and escaping detail.
use lazy_static::lazy_static;
use regex::Regex;
use std::borrow::Cow;

/// Rewrites every `MM/DD/YYYY` date found in `before` as `YYYY-MM-DD`.
///
/// The result is whatever `Regex::replace_all` returns for the input.
fn reformat_dates(before: &str) -> Cow<str> {
    lazy_static! {
        static ref USA_DATE_REGEX: Regex =
            Regex::new(r"(?P<m>\d{2})/(?P<d>\d{2})/(?P<y>\d{4})").unwrap();
    }

    // `$y`, `$m` and `$d` refer to the named capture groups in the pattern above.
    let iso_template = "$y-$m-$d";
    USA_DATE_REGEX.replace_all(before, iso_template)
}

fn main() {
    let before = "apple 03/14/2012, cherry 01/15/2013 and banana 07/05/2014";
    let after = reformat_dates(before);

    assert_eq!(after, "apple 2012-03-14, cherry 2013-01-15 and banana 2014-07-05");
}