lzma_rs/decode/
lzma.rs

1use crate::decode::lzbuffer::{LzBuffer, LzCircularBuffer};
2use crate::decode::rangecoder::{BitTree, LenDecoder, RangeDecoder};
3use crate::decompress::{Options, UnpackedSize};
4use crate::error;
5use crate::util::vec2d::Vec2D;
6use byteorder::{LittleEndian, ReadBytesExt};
7use std::io;
8
9/// Maximum input data that can be processed in one iteration.
10/// Libhtp uses the following equation to define the maximum number of bits
11/// for the worst case scenario:
12///   log2((2^11 / 31) ^ 22) + 26 < 134 + 26 = 160
13const MAX_REQUIRED_INPUT: usize = 20;
14
15/// Processing mode for decompression.
16///
17/// Tells the decompressor if we should expect more data after parsing the
18/// current input.
19#[derive(Debug, PartialEq)]
20enum ProcessingMode {
21    /// Streaming mode. Process the input bytes but assume there will be more
22    /// chunks of input data to receive in future calls to `process_mode()`.
23    Partial,
24    /// Synchronous mode. Process the input bytes and confirm end of stream has been reached.
25    /// Use this mode if you are processing a fixed buffer of compressed data, or after
26    /// using `Mode::Partial` to check for the end of stream.
27    Finish,
28}
29
30/// Result of the next iteration of processing.
31///
32/// Indicates whether processing should continue or is finished.
33#[derive(Debug, PartialEq)]
34enum ProcessingStatus {
35    Continue,
36    Finished,
37}
38
39#[derive(Debug, Copy, Clone)]
40/// LZMA 'lclppb' decompression properties.
41pub struct LzmaProperties {
42    /// The number of literal context bits.
43    ///
44    /// The most `lc` significant bits of the previous byte are part of the literal context.
45    /// `lc` must not be greater than 8.
46    pub lc: u32, // 0..=8
47    /// The number of literal position bits.
48    ///
49    /// `lp` must not be greater than 4.
50    pub lp: u32, // 0..=4
51    /// The number of position bits.
52    ///
53    /// The context for literal/match is plaintext offset modulo `2^pb`.
54    /// `pb` must not be greater than 4.
55    pub pb: u32, // 0..=4
56}
57
58impl LzmaProperties {
59    /// Assert the validity of the LZMA properties.
60    pub(crate) fn validate(&self) {
61        assert!(self.lc <= 8);
62        assert!(self.lp <= 4);
63        assert!(self.pb <= 4);
64    }
65}
66
67#[derive(Debug, Copy, Clone)]
68/// LZMA decompression parameters.
69pub struct LzmaParams {
70    /// The LZMA 'lclppb' decompression properties.
71    pub(crate) properties: LzmaProperties,
72    /// The dictionary size to use when decompressing.
73    pub(crate) dict_size: u32,
74    /// The size of the unpacked data.
75    pub(crate) unpacked_size: Option<u64>,
76}
77
78impl LzmaParams {
79    /// Create an new instance of LZMA parameters.
80    #[cfg(feature = "raw_decoder")]
81    pub fn new(
82        properties: LzmaProperties,
83        dict_size: u32,
84        unpacked_size: Option<u64>,
85    ) -> LzmaParams {
86        Self {
87            properties,
88            dict_size,
89            unpacked_size,
90        }
91    }
92
93    /// Read LZMA parameters from the LZMA stream header.
94    pub fn read_header<R>(input: &mut R, options: &Options) -> error::Result<LzmaParams>
95    where
96        R: io::BufRead,
97    {
98        // Properties
99        let props = input.read_u8().map_err(error::Error::HeaderTooShort)?;
100
101        let mut pb = props as u32;
102        if pb >= 225 {
103            return Err(error::Error::LzmaError(format!(
104                "LZMA header invalid properties: {} must be < 225",
105                pb
106            )));
107        }
108
109        let lc: u32 = pb % 9;
110        pb /= 9;
111        let lp: u32 = pb % 5;
112        pb /= 5;
113
114        lzma_info!("Properties {{ lc: {}, lp: {}, pb: {} }}", lc, lp, pb);
115
116        // Dictionary
117        let dict_size_provided = input
118            .read_u32::<LittleEndian>()
119            .map_err(error::Error::HeaderTooShort)?;
120        let dict_size = if dict_size_provided < 0x1000 {
121            0x1000
122        } else {
123            dict_size_provided
124        };
125
126        lzma_info!("Dict size: {}", dict_size);
127
128        // Unpacked size
129        let unpacked_size: Option<u64> = match options.unpacked_size {
130            UnpackedSize::ReadFromHeader => {
131                let unpacked_size_provided = input
132                    .read_u64::<LittleEndian>()
133                    .map_err(error::Error::HeaderTooShort)?;
134                let marker_mandatory: bool = unpacked_size_provided == 0xFFFF_FFFF_FFFF_FFFF;
135                if marker_mandatory {
136                    None
137                } else {
138                    Some(unpacked_size_provided)
139                }
140            }
141            UnpackedSize::ReadHeaderButUseProvided(x) => {
142                input
143                    .read_u64::<LittleEndian>()
144                    .map_err(error::Error::HeaderTooShort)?;
145                x
146            }
147            UnpackedSize::UseProvided(x) => x,
148        };
149
150        lzma_info!("Unpacked size: {:?}", unpacked_size);
151
152        let params = LzmaParams {
153            properties: LzmaProperties { lc, lp, pb },
154            dict_size,
155            unpacked_size,
156        };
157
158        Ok(params)
159    }
160}
161
162#[derive(Debug)]
163pub(crate) struct DecoderState {
164    // Buffer input data here if we need more for decompression. Up to
165    // MAX_REQUIRED_INPUT bytes can be consumed during one iteration.
166    partial_input_buf: std::io::Cursor<[u8; MAX_REQUIRED_INPUT]>,
167    pub(crate) lzma_props: LzmaProperties,
168    unpacked_size: Option<u64>,
169    literal_probs: Vec2D<u16>,
170    pos_slot_decoder: [BitTree; 4],
171    align_decoder: BitTree,
172    pos_decoders: [u16; 115],
173    is_match: [u16; 192], // true = LZ, false = literal
174    is_rep: [u16; 12],
175    is_rep_g0: [u16; 12],
176    is_rep_g1: [u16; 12],
177    is_rep_g2: [u16; 12],
178    is_rep_0long: [u16; 192],
179    state: usize,
180    rep: [usize; 4],
181    len_decoder: LenDecoder,
182    rep_len_decoder: LenDecoder,
183}
184
185impl DecoderState {
186    pub fn new(lzma_props: LzmaProperties, unpacked_size: Option<u64>) -> Self {
187        lzma_props.validate();
188        DecoderState {
189            partial_input_buf: std::io::Cursor::new([0; MAX_REQUIRED_INPUT]),
190            lzma_props,
191            unpacked_size,
192            literal_probs: Vec2D::init(0x400, (1 << (lzma_props.lc + lzma_props.lp), 0x300)),
193            pos_slot_decoder: [
194                BitTree::new(6),
195                BitTree::new(6),
196                BitTree::new(6),
197                BitTree::new(6),
198            ],
199            align_decoder: BitTree::new(4),
200            pos_decoders: [0x400; 115],
201            is_match: [0x400; 192],
202            is_rep: [0x400; 12],
203            is_rep_g0: [0x400; 12],
204            is_rep_g1: [0x400; 12],
205            is_rep_g2: [0x400; 12],
206            is_rep_0long: [0x400; 192],
207            state: 0,
208            rep: [0; 4],
209            len_decoder: LenDecoder::new(),
210            rep_len_decoder: LenDecoder::new(),
211        }
212    }
213
214    pub fn reset_state(&mut self, new_props: LzmaProperties) {
215        new_props.validate();
216        if self.lzma_props.lc + self.lzma_props.lp == new_props.lc + new_props.lp {
217            // We can reset here by filling the existing buffer with 0x400.
218            self.literal_probs.fill(0x400);
219        } else {
220            // We need to reallocate because of the new size of `lc+lp`.
221            self.literal_probs = Vec2D::init(0x400, (1 << (new_props.lc + new_props.lp), 0x300));
222        }
223
224        self.lzma_props = new_props;
225        self.pos_slot_decoder.iter_mut().for_each(|t| t.reset());
226        self.align_decoder.reset();
227        // For stack-allocated arrays, it was found to be faster to re-create new arrays
228        // dropping the existing one, rather than using `fill` to reset the contents to zero.
229        // Heap-based arrays use fill to keep their allocation rather than reallocate.
230        self.pos_decoders = [0x400; 115];
231        self.is_match = [0x400; 192];
232        self.is_rep = [0x400; 12];
233        self.is_rep_g0 = [0x400; 12];
234        self.is_rep_g1 = [0x400; 12];
235        self.is_rep_g2 = [0x400; 12];
236        self.is_rep_0long = [0x400; 192];
237        self.state = 0;
238        self.rep = [0; 4];
239        self.len_decoder.reset();
240        self.rep_len_decoder.reset();
241    }
242
243    pub fn set_unpacked_size(&mut self, unpacked_size: Option<u64>) {
244        self.unpacked_size = unpacked_size;
245    }
246
247    pub fn process<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
248        &mut self,
249        output: &mut LZB,
250        rangecoder: &mut RangeDecoder<'a, R>,
251    ) -> error::Result<()> {
252        self.process_mode(output, rangecoder, ProcessingMode::Finish)
253    }
254
255    #[cfg(feature = "stream")]
256    pub fn process_stream<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
257        &mut self,
258        output: &mut LZB,
259        rangecoder: &mut RangeDecoder<'a, R>,
260    ) -> error::Result<()> {
261        self.process_mode(output, rangecoder, ProcessingMode::Partial)
262    }
263
264    /// Process the next iteration of the loop.
265    ///
266    /// If the update flag is true, the decoder's state will be updated.
267    ///
268    /// Returns `ProcessingStatus` to determine whether one should continue
269    /// processing the loop.
270    fn process_next_inner<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
271        &mut self,
272        output: &mut LZB,
273        rangecoder: &mut RangeDecoder<'a, R>,
274        update: bool,
275    ) -> error::Result<ProcessingStatus> {
276        let pos_state = output.len() & ((1 << self.lzma_props.pb) - 1);
277
278        // Literal
279        if !rangecoder.decode_bit(
280            // TODO: assumes pb = 2 ??
281            &mut self.is_match[(self.state << 4) + pos_state],
282            update,
283        )? {
284            let byte: u8 = self.decode_literal(output, rangecoder, update)?;
285
286            if update {
287                lzma_debug!("Literal: {}", byte);
288                output.append_literal(byte)?;
289
290                self.state = if self.state < 4 {
291                    0
292                } else if self.state < 10 {
293                    self.state - 3
294                } else {
295                    self.state - 6
296                };
297            }
298            return Ok(ProcessingStatus::Continue);
299        }
300
301        // LZ
302        let mut len: usize;
303        // Distance is repeated from LRU
304        if rangecoder.decode_bit(&mut self.is_rep[self.state], update)? {
305            // dist = rep[0]
306            if !rangecoder.decode_bit(&mut self.is_rep_g0[self.state], update)? {
307                // len = 1
308                if !rangecoder.decode_bit(
309                    &mut self.is_rep_0long[(self.state << 4) + pos_state],
310                    update,
311                )? {
312                    // update state (short rep)
313                    if update {
314                        self.state = if self.state < 7 { 9 } else { 11 };
315                        let dist = self.rep[0] + 1;
316                        output.append_lz(1, dist)?;
317                    }
318                    return Ok(ProcessingStatus::Continue);
319                }
320            // dist = rep[i]
321            } else {
322                let idx: usize;
323                if !rangecoder.decode_bit(&mut self.is_rep_g1[self.state], update)? {
324                    idx = 1;
325                } else if !rangecoder.decode_bit(&mut self.is_rep_g2[self.state], update)? {
326                    idx = 2;
327                } else {
328                    idx = 3;
329                }
330                if update {
331                    // Update LRU
332                    let dist = self.rep[idx];
333                    for i in (0..idx).rev() {
334                        self.rep[i + 1] = self.rep[i];
335                    }
336                    self.rep[0] = dist
337                }
338            }
339
340            len = self.rep_len_decoder.decode(rangecoder, pos_state, update)?;
341
342            if update {
343                // update state (rep)
344                self.state = if self.state < 7 { 8 } else { 11 };
345            }
346        // New distance
347        } else {
348            if update {
349                // Update LRU
350                self.rep[3] = self.rep[2];
351                self.rep[2] = self.rep[1];
352                self.rep[1] = self.rep[0];
353            }
354
355            len = self.len_decoder.decode(rangecoder, pos_state, update)?;
356
357            if update {
358                // update state (match)
359                self.state = if self.state < 7 { 7 } else { 10 };
360            }
361
362            let rep_0 = self.decode_distance(rangecoder, len, update)?;
363
364            if update {
365                self.rep[0] = rep_0;
366                if self.rep[0] == 0xFFFF_FFFF {
367                    if rangecoder.is_finished_ok()? {
368                        return Ok(ProcessingStatus::Finished);
369                    }
370                    return Err(error::Error::LzmaError(String::from(
371                        "Found end-of-stream marker but more bytes are available",
372                    )));
373                }
374            }
375        }
376
377        if update {
378            len += 2;
379
380            let dist = self.rep[0] + 1;
381            output.append_lz(len, dist)?;
382        }
383
384        Ok(ProcessingStatus::Continue)
385    }
386
387    fn process_next<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
388        &mut self,
389        output: &mut LZB,
390        rangecoder: &mut RangeDecoder<'a, R>,
391    ) -> error::Result<ProcessingStatus> {
392        self.process_next_inner(output, rangecoder, true)
393    }
394
395    /// Try to process the next iteration of the loop.
396    ///
397    /// This will check to see if there is enough data to consume and advance the
398    /// decompressor. Needed in streaming mode to avoid corrupting the state while
399    /// processing incomplete chunks of data.
400    fn try_process_next<W: io::Write, LZB: LzBuffer<W>>(
401        &mut self,
402        output: &mut LZB,
403        buf: &[u8],
404        range: u32,
405        code: u32,
406    ) -> error::Result<()> {
407        let mut temp = std::io::Cursor::new(buf);
408        let mut rangecoder = RangeDecoder::from_parts(&mut temp, range, code);
409        let _ = self.process_next_inner(output, &mut rangecoder, false)?;
410        Ok(())
411    }
412
413    /// Utility function to read data into the partial input buffer.
414    fn read_partial_input_buf<'a, R: io::BufRead>(
415        &mut self,
416        rangecoder: &mut RangeDecoder<'a, R>,
417    ) -> error::Result<()> {
418        // Fill as much of the tmp buffer as possible
419        let start = self.partial_input_buf.position() as usize;
420        let bytes_read =
421            rangecoder.read_into(&mut self.partial_input_buf.get_mut()[start..])? as u64;
422        self.partial_input_buf
423            .set_position(self.partial_input_buf.position() + bytes_read);
424        Ok(())
425    }
426
427    fn process_mode<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
428        &mut self,
429        output: &mut LZB,
430        rangecoder: &mut RangeDecoder<'a, R>,
431        mode: ProcessingMode,
432    ) -> error::Result<()> {
433        loop {
434            if let Some(unpacked_size) = self.unpacked_size {
435                if output.len() as u64 >= unpacked_size {
436                    break;
437                }
438            } else if match mode {
439                ProcessingMode::Partial => {
440                    rangecoder.is_eof()? && self.partial_input_buf.position() as usize == 0
441                }
442                ProcessingMode::Finish => {
443                    rangecoder.is_finished_ok()? && self.partial_input_buf.position() as usize == 0
444                }
445            } {
446                break;
447            }
448
449            if self.partial_input_buf.position() as usize > 0 {
450                self.read_partial_input_buf(rangecoder)?;
451                let tmp = *self.partial_input_buf.get_ref();
452
453                // Check if we need more data to advance the decompressor
454                if mode == ProcessingMode::Partial
455                    && (self.partial_input_buf.position() as usize) < MAX_REQUIRED_INPUT
456                    && self
457                        .try_process_next(
458                            output,
459                            &tmp[..self.partial_input_buf.position() as usize],
460                            rangecoder.range,
461                            rangecoder.code,
462                        )
463                        .is_err()
464                {
465                    return Ok(());
466                }
467
468                // Run the decompressor on the tmp buffer
469                let mut tmp_reader =
470                    io::Cursor::new(&tmp[..self.partial_input_buf.position() as usize]);
471                let mut tmp_rangecoder =
472                    RangeDecoder::from_parts(&mut tmp_reader, rangecoder.range, rangecoder.code);
473                let res = self.process_next(output, &mut tmp_rangecoder)?;
474
475                // Update the actual rangecoder
476                rangecoder.set(tmp_rangecoder.range, tmp_rangecoder.code);
477
478                // Update tmp buffer
479                let end = self.partial_input_buf.position();
480                let new_len = end - tmp_reader.position();
481                self.partial_input_buf.get_mut()[..new_len as usize]
482                    .copy_from_slice(&tmp[tmp_reader.position() as usize..end as usize]);
483                self.partial_input_buf.set_position(new_len);
484
485                if res == ProcessingStatus::Finished {
486                    break;
487                };
488            } else {
489                let buf: &[u8] = rangecoder.stream.fill_buf()?;
490                if mode == ProcessingMode::Partial
491                    && buf.len() < MAX_REQUIRED_INPUT
492                    && self
493                        .try_process_next(output, buf, rangecoder.range, rangecoder.code)
494                        .is_err()
495                {
496                    return self.read_partial_input_buf(rangecoder);
497                }
498
499                if self.process_next(output, rangecoder)? == ProcessingStatus::Finished {
500                    break;
501                };
502            }
503        }
504
505        if let Some(len) = self.unpacked_size {
506            if mode == ProcessingMode::Finish && len != output.len() as u64 {
507                return Err(error::Error::LzmaError(format!(
508                    "Expected unpacked size of {} but decompressed to {}",
509                    len,
510                    output.len()
511                )));
512            }
513        }
514
515        Ok(())
516    }
517
518    fn decode_literal<'a, W: io::Write, LZB: LzBuffer<W>, R: io::BufRead>(
519        &mut self,
520        output: &mut LZB,
521        rangecoder: &mut RangeDecoder<'a, R>,
522        update: bool,
523    ) -> error::Result<u8> {
524        let def_prev_byte = 0u8;
525        let prev_byte = output.last_or(def_prev_byte) as usize;
526
527        let mut result: usize = 1;
528        let lit_state = ((output.len() & ((1 << self.lzma_props.lp) - 1)) << self.lzma_props.lc)
529            + (prev_byte >> (8 - self.lzma_props.lc));
530        let probs = &mut self.literal_probs[lit_state];
531
532        if self.state >= 7 {
533            let mut match_byte = output.last_n(self.rep[0] + 1)? as usize;
534
535            while result < 0x100 {
536                let match_bit = (match_byte >> 7) & 1;
537                match_byte <<= 1;
538                let bit = rangecoder
539                    .decode_bit(&mut probs[((1 + match_bit) << 8) + result], update)?
540                    as usize;
541                result = (result << 1) ^ bit;
542                if match_bit != bit {
543                    break;
544                }
545            }
546        }
547
548        while result < 0x100 {
549            result = (result << 1) ^ (rangecoder.decode_bit(&mut probs[result], update)? as usize);
550        }
551
552        Ok((result - 0x100) as u8)
553    }
554
555    fn decode_distance<'a, R: io::BufRead>(
556        &mut self,
557        rangecoder: &mut RangeDecoder<'a, R>,
558        length: usize,
559        update: bool,
560    ) -> error::Result<usize> {
561        let len_state = if length > 3 { 3 } else { length };
562
563        let pos_slot = self.pos_slot_decoder[len_state].parse(rangecoder, update)? as usize;
564        if pos_slot < 4 {
565            return Ok(pos_slot);
566        }
567
568        let num_direct_bits = (pos_slot >> 1) - 1;
569        let mut result = (2 ^ (pos_slot & 1)) << num_direct_bits;
570
571        if pos_slot < 14 {
572            result += rangecoder.parse_reverse_bit_tree(
573                num_direct_bits,
574                &mut self.pos_decoders,
575                result - pos_slot,
576                update,
577            )? as usize;
578        } else {
579            result += (rangecoder.get(num_direct_bits - 4)? as usize) << 4;
580            result += self.align_decoder.parse_reverse(rangecoder, update)? as usize;
581        }
582
583        Ok(result)
584    }
585}
586
587#[derive(Debug)]
588/// Raw decoder for LZMA.
589pub struct LzmaDecoder {
590    params: LzmaParams,
591    memlimit: usize,
592    state: DecoderState,
593}
594
595impl LzmaDecoder {
596    /// Creates a new object ready for decompressing data that it's given for the input
597    /// dict size, expected unpacked data size, and memory limit for the internal buffer.
598    pub fn new(params: LzmaParams, memlimit: Option<usize>) -> error::Result<LzmaDecoder> {
599        Ok(Self {
600            params,
601            memlimit: memlimit.unwrap_or(usize::MAX),
602            state: DecoderState::new(params.properties, params.unpacked_size),
603        })
604    }
605
606    /// Performs the equivalent of replacing this decompression state with a freshly allocated copy.
607    ///
608    /// Because the decoder state is reset, the unpacked size may optionally be re-specified. If `None`
609    /// is given, the previous unpacked size that the decoder was initialized with remains unchanged.
610    ///
611    /// This function may not allocate memory and will attempt to reuse any previously allocated resources.
612    #[cfg(feature = "raw_decoder")]
613    pub fn reset(&mut self, unpacked_size: Option<Option<u64>>) {
614        self.state.reset_state(self.params.properties);
615
616        if let Some(unpacked_size) = unpacked_size {
617            self.state.set_unpacked_size(unpacked_size);
618        }
619    }
620
621    /// Decompresses the input data into the output, consuming only as much input as needed and writing as much output as possible.
622    pub fn decompress<W: io::Write, R: io::BufRead>(
623        &mut self,
624        input: &mut R,
625        output: &mut W,
626    ) -> error::Result<()> {
627        let mut output =
628            LzCircularBuffer::from_stream(output, self.params.dict_size as usize, self.memlimit);
629
630        let mut rangecoder = RangeDecoder::new(input)
631            .map_err(|e| error::Error::LzmaError(format!("LZMA stream too short: {}", e)))?;
632        self.state.process(&mut output, &mut rangecoder)?;
633        output.finish()?;
634        Ok(())
635    }
636}