shlex/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
// Copyright 2015 Nicholas Allegra (comex).
// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or
// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.

//! Parse strings like, and escape strings for, POSIX shells.
//!
//! Same idea as (but implementation not directly based on) the Python shlex module.
//!
//! Disabling the `std` feature (which is enabled by default) will allow the crate to work in
//! `no_std` environments, where the `alloc` crate, and a global allocator, are available.
//!
//! ## <span style="color:red">Warning</span>
//!
//! The [`try_quote`]/[`try_join`] family of APIs does not quote control characters (because they
//! cannot be quoted portably).
//!
//! This is fully safe in noninteractive contexts, like shell scripts and `sh -c` arguments (or
//! even scripts `source`d from interactive shells).
//!
//! But if you are quoting for human consumption, you should keep in mind that ugly inputs produce
//! ugly outputs (which may not be copy-pastable).
//!
//! And if by chance you are piping the output of [`try_quote`]/[`try_join`] directly to the stdin
//! of an interactive shell, you should stop, because control characters can lead to arbitrary
//! command injection.
//!
//! For more information, and for information about more minor issues, please see [quoting_warning].
//!
//! ## Compatibility
//!
//! This crate's quoting functionality tries to be compatible with **any POSIX-compatible shell**;
//! it's tested against `bash`, `zsh`, `dash`, Busybox `ash`, and `mksh`, plus `fish` (which is not
//! POSIX-compatible but close enough).
//!
//! It also aims to be compatible with Python `shlex` and C `wordexp`.

#![cfg_attr(not(feature = "std"), no_std)]

extern crate alloc;
use alloc::vec::Vec;
use alloc::borrow::Cow;
use alloc::string::String;
#[cfg(test)]
use alloc::vec;
#[cfg(test)]
use alloc::borrow::ToOwned;

pub mod bytes;
#[cfg(all(doc, not(doctest)))]
#[path = "quoting_warning.md"]
pub mod quoting_warning;

/// An iterator that takes an input string and splits it into the words using the same syntax as
/// the POSIX shell.
///
/// See [`bytes::Shlex`].
pub struct Shlex<'a>(bytes::Shlex<'a>);

impl<'a> Shlex<'a> {
    pub fn new(in_str: &'a str) -> Self {
        Self(bytes::Shlex::new(in_str.as_bytes()))
    }
}

impl<'a> Iterator for Shlex<'a> {
    type Item = String;
    fn next(&mut self) -> Option<String> {
        self.0.next().map(|byte_word| {
            // Safety: given valid UTF-8, bytes::Shlex will always return valid UTF-8.
            unsafe { String::from_utf8_unchecked(byte_word) }
        })
    }
}

impl<'a> core::ops::Deref for Shlex<'a> {
    type Target = bytes::Shlex<'a>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl<'a> core::ops::DerefMut for Shlex<'a> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

/// Convenience function that consumes the whole string at once.  Returns None if the input was
/// erroneous.
pub fn split(in_str: &str) -> Option<Vec<String>> {
    let mut shl = Shlex::new(in_str);
    let res = shl.by_ref().collect();
    if shl.had_error { None } else { Some(res) }
}

/// Errors from [`Quoter::quote`], [`Quoter::join`], etc. (and their [`bytes`] counterparts).
///
/// By default, the only error that can be returned is [`QuoteError::Nul`].  If you call
/// `allow_nul(true)`, then no errors can be returned at all.  Any error variants added in the
/// future will not be enabled by default; they will be enabled through corresponding non-default
/// [`Quoter`] options.
///
/// ...In theory.  In the unlikely event that additional classes of inputs are discovered that,
/// like nul bytes, are fundamentally unsafe to quote even for non-interactive shells, the risk
/// will be mitigated by adding corresponding [`QuoteError`] variants that *are* enabled by
/// default.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum QuoteError {
    /// The input contained a nul byte.  In most cases, shells fundamentally [cannot handle strings
    /// containing nul bytes](quoting_warning#nul-bytes), no matter how they are quoted.  But if
    /// you're sure you can handle nul bytes, you can call `allow_nul(true)` on the `Quoter` to let
    /// them pass through.
    Nul,
}

impl core::fmt::Display for QuoteError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            QuoteError::Nul => f.write_str("cannot shell-quote string containing nul byte"),
        }
    }
}

#[cfg(feature = "std")]
impl std::error::Error for QuoteError {}

/// A more configurable interface to quote strings.  If you only want the default settings you can
/// use the convenience functions [`try_quote`] and [`try_join`].
///
/// The bytes equivalent is [`bytes::Quoter`].
#[derive(Default, Debug, Clone)]
pub struct Quoter {
    inner: bytes::Quoter,
}

impl Quoter {
    /// Create a new [`Quoter`] with default settings.
    #[inline]
    pub fn new() -> Self {
        Self::default()
    }

    /// Set whether to allow [nul bytes](quoting_warning#nul-bytes).  By default they are not
    /// allowed and will result in an error of [`QuoteError::Nul`].
    #[inline]
    pub fn allow_nul(mut self, allow: bool) -> Self {
        self.inner = self.inner.allow_nul(allow);
        self
    }

    /// Convenience function that consumes an iterable of words and turns it into a single string,
    /// quoting words when necessary. Consecutive words will be separated by a single space.
    pub fn join<'a, I: IntoIterator<Item = &'a str>>(&self, words: I) -> Result<String, QuoteError> {
        // Safety: given valid UTF-8, bytes::join() will always return valid UTF-8.
        self.inner.join(words.into_iter().map(|s| s.as_bytes()))
            .map(|bytes| unsafe { String::from_utf8_unchecked(bytes) })
    }

    /// Given a single word, return a string suitable to encode it as a shell argument.
    pub fn quote<'a>(&self, in_str: &'a str) -> Result<Cow<'a, str>, QuoteError> {
        Ok(match self.inner.quote(in_str.as_bytes())? {
            Cow::Borrowed(out) => {
                // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8.
                unsafe { core::str::from_utf8_unchecked(out) }.into()
            }
            Cow::Owned(out) => {
                // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8.
                unsafe { String::from_utf8_unchecked(out) }.into()
            }
        })
    }
}

impl From<bytes::Quoter> for Quoter {
    fn from(inner: bytes::Quoter) -> Quoter {
        Quoter { inner }
    }
}

impl From<Quoter> for bytes::Quoter {
    fn from(quoter: Quoter) -> bytes::Quoter {
        quoter.inner
    }
}

/// Convenience function that consumes an iterable of words and turns it into a single string,
/// quoting words when necessary. Consecutive words will be separated by a single space.
///
/// Uses default settings except that nul bytes are passed through, which [may be
/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
///
/// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter).
///
/// (That configuration never returns `Err`, so this function does not panic.)
///
/// The bytes equivalent is [bytes::join].
#[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")]
pub fn join<'a, I: IntoIterator<Item = &'a str>>(words: I) -> String {
    Quoter::new().allow_nul(true).join(words).unwrap()
}

/// Convenience function that consumes an iterable of words and turns it into a single string,
/// quoting words when necessary. Consecutive words will be separated by a single space.
///
/// Uses default settings.  The only error that can be returned is [`QuoteError::Nul`].
///
/// Equivalent to [`Quoter::new().join(words)`](Quoter).
///
/// The bytes equivalent is [bytes::try_join].
pub fn try_join<'a, I: IntoIterator<Item = &'a str>>(words: I) -> Result<String, QuoteError> {
    Quoter::new().join(words)
}

/// Given a single word, return a string suitable to encode it as a shell argument.
///
/// Uses default settings except that nul bytes are passed through, which [may be
/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
///
/// Equivalent to [`Quoter::new().allow_nul(true).quote(in_str).unwrap()`](Quoter).
///
/// (That configuration never returns `Err`, so this function does not panic.)
///
/// The bytes equivalent is [bytes::quote].
#[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")]
pub fn quote(in_str: &str) -> Cow<str> {
    Quoter::new().allow_nul(true).quote(in_str).unwrap()
}

/// Given a single word, return a string suitable to encode it as a shell argument.
///
/// Uses default settings.  The only error that can be returned is [`QuoteError::Nul`].
///
/// Equivalent to [`Quoter::new().quote(in_str)`](Quoter).
///
/// (That configuration never returns `Err`, so this function does not panic.)
///
/// The bytes equivalent is [bytes::try_quote].
pub fn try_quote(in_str: &str) -> Result<Cow<str>, QuoteError> {
    Quoter::new().quote(in_str)
}

#[cfg(test)]
static SPLIT_TEST_ITEMS: &'static [(&'static str, Option<&'static [&'static str]>)] = &[
    ("foo$baz", Some(&["foo$baz"])),
    ("foo baz", Some(&["foo", "baz"])),
    ("foo\"bar\"baz", Some(&["foobarbaz"])),
    ("foo \"bar\"baz", Some(&["foo", "barbaz"])),
    ("   foo \nbar", Some(&["foo", "bar"])),
    ("foo\\\nbar", Some(&["foobar"])),
    ("\"foo\\\nbar\"", Some(&["foobar"])),
    ("'baz\\$b'", Some(&["baz\\$b"])),
    ("'baz\\\''", None),
    ("\\", None),
    ("\"\\", None),
    ("'\\", None),
    ("\"", None),
    ("'", None),
    ("foo #bar\nbaz", Some(&["foo", "baz"])),
    ("foo #bar", Some(&["foo"])),
    ("foo#bar", Some(&["foo#bar"])),
    ("foo\"#bar", None),
    ("'\\n'", Some(&["\\n"])),
    ("'\\\\n'", Some(&["\\\\n"])),
];

#[test]
fn test_split() {
    for &(input, output) in SPLIT_TEST_ITEMS {
        assert_eq!(split(input), output.map(|o| o.iter().map(|&x| x.to_owned()).collect()));
    }
}

#[test]
fn test_lineno() {
    let mut sh = Shlex::new("\nfoo\nbar");
    while let Some(word) = sh.next() {
        if word == "bar" {
            assert_eq!(sh.line_no, 3);
        }
    }
}

#[test]
#[cfg_attr(not(feature = "std"), allow(unreachable_code, unused_mut))]
fn test_quote() {
    // This is a list of (unquoted, quoted) pairs.
    // But it's using a single long (raw) string literal with an ad-hoc format, just because it's
    // hard to read if we have to put the test strings through Rust escaping on top of the escaping
    // being tested.  (Even raw string literals are noisy for short strings).
    // Ad-hoc: "NL" is replaced with a literal newline; no other escape sequences.
    let tests = r#"
        <>                => <''>
        <foobar>          => <foobar>
        <foo bar>         => <'foo bar'>
        <"foo bar'">      => <"\"foo bar'\"">
        <'foo bar'>       => <"'foo bar'">
        <">               => <'"'>
        <"'>              => <"\"'">
        <hello!world>     => <'hello!world'>
        <'hello!world>    => <"'hello"'!world'>
        <'hello!>         => <"'hello"'!'>
        <hello ^ world>   => <'hello ''^ world'>
        <hello^>          => <hello'^'>
        <!world'>         => <'!world'"'">
        <{a, b}>          => <'{a, b}'>
        <NL>              => <'NL'>
        <^>               => <'^'>
        <foo^bar>         => <foo'^bar'>
        <NLx^>            => <'NLx''^'>
        <NL^x>            => <'NL''^x'>
        <NL ^x>           => <'NL ''^x'>
        <{a,b}>           => <'{a,b}'>
        <a,b>             => <'a,b'>
        <a..b             => <a..b>
        <'$>              => <"'"'$'>
        <"^>              => <'"''^'>
    "#;
    let mut ok = true;
    for test in tests.trim().split('\n') {
        let parts: Vec<String> = test
            .replace("NL", "\n")
            .split("=>")
            .map(|part| part.trim().trim_start_matches('<').trim_end_matches('>').to_owned())
            .collect();
        assert!(parts.len() == 2);
        let unquoted = &*parts[0];
        let quoted_expected = &*parts[1];
        let quoted_actual = try_quote(&parts[0]).unwrap();
        if quoted_expected != quoted_actual {
            #[cfg(not(feature = "std"))]
            panic!("FAIL: for input <{}>, expected <{}>, got <{}>",
                     unquoted, quoted_expected, quoted_actual);
            #[cfg(feature = "std")]
            println!("FAIL: for input <{}>, expected <{}>, got <{}>",
                     unquoted, quoted_expected, quoted_actual);
            ok = false;
        }
    }
    assert!(ok);
}

#[test]
#[allow(deprecated)]
fn test_join() {
    assert_eq!(join(vec![]), "");
    assert_eq!(join(vec![""]), "''");
    assert_eq!(join(vec!["a", "b"]), "a b");
    assert_eq!(join(vec!["foo bar", "baz"]), "'foo bar' baz");
}

#[test]
fn test_fallible() {
    assert_eq!(try_join(vec!["\0"]), Err(QuoteError::Nul));
    assert_eq!(try_quote("\0"), Err(QuoteError::Nul));
}