Source code for pcapgraph.manipulate_frames

# -*- coding: utf-8 -*-
# Copyright 2018 Ross Jacobs All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parse the frames from files based upon options.

The frame dicts use the same JSON structure that
`tshark -r examples/simul1.pcap -T json -x` produces.
Note that the <var>_raw keys are present because of the -x flag.

::

    Frame JSON looks like this:
    {
        '_index': 'packets-2018-11-03',
        '_type': 'pcap_file',
        '_score': None,
        '_source': {
            'layers': {
                'frame_raw': ['881544abbfdd2477035113440800450000380b5d0000...
                'frame': {'frame.encap_type': '1', 'frame.time': 'Sep 26, 2...
                'eth_raw': ['881544abbfdd2477035113440800', 0, 14, 0, 1],
                'eth': {'eth.dst_raw': ['881544abbfdd', 0, 6, 0, 29], 'eth...
                'ip_raw': ['450000380b5d00004011c7980a3012900a808080', 14, 2...
                'ip': {'ip.version_raw': ['4', 14, 1, 240, 4], 'ip.version'...
                'udp_raw': ['ea6200350024a492', 34, 8, 0, 1],
                'udp': {'udp.srcport_raw': ['ea62', 34, 2, 0, 5], 'udp.srcp...
                'dns_raw': ['9b130100000100000000000006616d617a6f6e03636f6d...
                'dns': {'dns.id_raw': ['9b13', 42, 2, 0, 5], 'dns.id': '0x00...
            }
        }
    }

Many of these functions interact with this frame dict format or directly with
the frame string (seen in 'frame_raw'). The frame string is a string of the
hex of a packet.
"""

import subprocess as sp
import random
import json


def parse_pcaps(pcaps):
    """Load every pcap in ``pcaps`` as tshark-style JSON.

    Args:
        pcaps (list): Pcap filenames to read.
    Returns:
        (list): One entry per input file, in input order. Each entry is
            whatever get_pcap_as_json returns for that file.
    """
    return [get_pcap_as_json(pcap_file) for pcap_file in pcaps]
def get_flat_frame_dict(pcap_json_list):
    """Flatten a list of pcap JSONs into one frame -> timestamp mapping.

    Args:
        pcap_json_list (list): List of pcap JSONs, each an iterable of
            frame dicts (see parse_pcaps).
    Returns:
        (dict): {<frame hex string>: <frame.time_epoch value>, ...}
            A frame that occurs more than once keeps the timestamp of
            its last occurrence, because later entries overwrite earlier
            ones.
    """
    timestamps_by_frame = {}
    for capture in pcap_json_list:
        for packet in capture:
            hex_frame = get_frame_from_json(packet)
            layers = packet['_source']['layers']
            timestamps_by_frame[hex_frame] = \
                layers['frame']['frame.time_epoch']
    return timestamps_by_frame
def get_frame_list_by_pcap(pcap_json_dict):
    """Collect the raw frame strings of each pcap into its own list.

    The keys of ``pcap_json_dict`` are not used; only the values are
    read, in the dict's iteration order.

    Args:
        pcap_json_dict (dict): Mapping whose values are pcap JSONs
            (iterables of frame dicts).
    Returns:
        (list): [[<frame>, ...], ...] with one inner list per pcap.
    """
    return [
        [get_frame_from_json(packet) for packet in capture]
        for capture in pcap_json_dict.values()
    ]
def get_pcap_frame_dict(pcaps):
    """Build a frame -> timestamp dict for each pcap, keyed by filename.

    Each file is parsed on its own with parse_pcaps and flattened with
    get_flat_frame_dict.

    Args:
        pcaps (list): List of pcap file names.
    Returns:
        (dict): {<pcap>: {<frame>: <timestamp>, ...}, ...}
    """
    return {
        pcap_name: get_flat_frame_dict(parse_pcaps([pcap_name]))
        for pcap_name in pcaps
    }
def get_frame_from_json(frame):
    """Extract the raw frame hex string from one tshark JSON frame.

    Args:
        frame (dict): A single packet from `tshark -x -T json ...`.
    Returns:
        (str): The value stored under _source -> layers -> frame_raw.
            When that value is a list, its first element is returned.
    Raises:
        TypeError: If ``frame`` is not a dict. The offending type is
            printed first.
    """
    if not isinstance(frame, dict):
        print('frame is type', type(frame))
        raise TypeError("Frame must be dict!\n" + str(frame)[:120] + '...')
    hex_frame = frame['_source']['layers']['frame_raw']
    # frame_raw is sometimes a list whose first item is the hex string.
    return hex_frame[0] if isinstance(hex_frame, list) else hex_frame
def get_pcap_as_json(pcap):
    """Run `tshark -r <pcap> -x -T json` and return the parsed output.

    Args:
        pcap (string): File name of the capture to read.
    Returns:
        (list): The JSON that tshark wrote to stdout, parsed with
            json.loads. An empty list if tshark wrote nothing.
    Raises:
        TypeError: If ``pcap`` is not a string.
    """
    if not isinstance(pcap, str):
        raise TypeError("Filename must be string!\n" + str(pcap)[:120] + '...')
    tshark = sp.Popen(['tshark', '-r', pcap, '-x', '-T', 'json'],
                      stdout=sp.PIPE)
    raw_output = tshark.communicate()[0]
    tshark.kill()
    # json.loads raises on empty input, so only parse non-empty output.
    return json.loads(raw_output) if raw_output else []
def _first_if_list(value):
    """Return value[0] if value is a list, otherwise value unchanged."""
    return value[0] if isinstance(value, list) else value


def strip_layers(filenames, options):
    """Get the PCAP JSON dict with each frame_raw stripped per options.

    strip-l3: Replace frame_raw with the homogenized IP header (see
        get_homogenized_packet) followed by everything after the IP
        header. The bytes before the IP header are dropped as well.
    strip-l2: Remove the leading eth_raw-length prefix from frame_raw.
        Only consulted when strip-l3 is falsy.

    Only the 'frame_raw' entry of each packet is rewritten, and it is
    always stored as a plain string afterwards.

    Args:
        filenames (list): List of filenames.
        options (dict): Must contain 'strip-l3'. 'strip-l2' is read only
            when 'strip-l3' is falsy.
    Returns:
        (dict): {<filename>: <modified pcap JSON>, ...}
    Raises:
        KeyError: If a packet lacks the 'ip_raw' (strip-l3) or 'eth_raw'
            (strip-l2) layer.
        IndexError: If ip_raw does not occur in frame_raw (strip-l3).
    """
    pcap_json_dict = {}
    for filename in filenames:
        pcap_json = parse_pcaps([filename])[0]
        if options['strip-l3']:
            for packet in pcap_json:
                layers = packet['_source']['layers']
                # These values are sometimes lists instead of strings.
                ip_raw = _first_if_list(layers['ip_raw'])
                frame_raw = _first_if_list(layers['frame_raw'])
                # Split once only: if the header hex recurs later in the
                # frame, an unlimited split would cut the payload short
                # at the second occurrence.
                after_ip_header = frame_raw.split(ip_raw, 1)[1]
                layers['frame_raw'] = \
                    get_homogenized_packet(ip_raw) + after_ip_header
        elif options['strip-l2']:
            for packet in pcap_json:
                layers = packet['_source']['layers']
                eth_raw = _first_if_list(layers['eth_raw'])
                frame_raw = _first_if_list(layers['frame_raw'])
                layers['frame_raw'] = frame_raw[len(eth_raw):]
        pcap_json_dict[filename] = pcap_json
    return pcap_json_dict
def get_homogenized_packet(ip_raw):
    """Overwrite the hop-dependent IPv4 header fields with fixed values.

    TTL, header checksum, and source/destination address are replaced,
    since these are the IPv4 fields expected to differ across a layer 3
    boundary (TTL and checksum change per hop, addresses change with
    NAT). Everything else, including the protocol byte, is kept.

    Args:
        ip_raw (str): ASCII hex of the packet, starting at the IPv4
            header.
    Returns:
        (str): The input with hex chars 16-18 (TTL), 20-24 (checksum),
            24-32 (source) and 32-40 (destination) replaced.
    """
    generic_ttl = 'ff'
    generic_checksum = '1337'
    generic_src = '0a010101'
    generic_dst = '0a020202'
    return ''.join([
        ip_raw[:16],      # everything before the TTL
        generic_ttl,
        ip_raw[18:20],    # protocol byte is preserved
        generic_checksum,
        generic_src,
        generic_dst,
        ip_raw[40:],      # anything after the destination address
    ])
def anonymous_pcap_names(num_names):
    """Generate random '<place>-<device>' stand-ins for pcap names.

    Place and device are drawn independently for every name, so the
    same name may be returned more than once. Odd combinations are
    intended behavior.

    Args:
        num_names (int): Number of names to be returned
    Returns:
        (list): Fake pcap name list
    """
    places = (
        'Hogwarts', 'Quahog', 'Lake Wobegon', 'Narnia', 'Ankh-Morpork',
        'Gotham City', 'Asgard', 'Neverland', 'The Shire', 'Rivendell',
        'Diagon Alley', 'King\'s Landing', 'Cooper Station', 'Dragonstone',
        'El Dorado', 'Atlantis', 'Pallet Town', 'Shangri-La', 'Mos Eisley',
    )
    devices = (
        'firewall', 'router', 'access point', 'switch', 'bridge',
        'repeater', 'dial-up modem', 'proxy server', 'hub', 'tokenring mau',
        'gateway', 'turbo encabulator', 'L3 switch', 'HIDS', 'load balancer',
        'packet shaper', 'vpn concentrator', 'content filter', 'CSU/DSU',
    )
    # The tuple is built left to right: place is drawn before device.
    return [
        '-'.join((random.choice(places), random.choice(devices)))
        for _ in range(num_names)
    ]
def decode_stdout(stdout):
    """Return a process's stdout as a stripped UTF-8 string.

    Args:
        stdout: Object with a communicate() method whose first return
            item is the stdout bytes (get_packet_count passes a
            subprocess.Popen).
    Returns:
        (str): The decoded output without surrounding whitespace.
    """
    raw_output = stdout.communicate()[0]
    return raw_output.decode('utf-8').strip()
def get_packet_count(filename):
    """Count the packets in a capture file by running tshark on it.

    Runs `tshark -r <filename> -2` and counts the non-empty lines of its
    text output (assumes tshark prints one line per packet).

    Args:
        filename (str): Path of a file, including extension
    Returns:
        packet_count (int): How many packets were in that pcap
    """
    tshark = sp.Popen(['tshark', '-r', filename, '-2'],
                      stdout=sp.PIPE, stderr=sp.PIPE)
    listing = decode_stdout(tshark)
    tshark.kill()
    # Splitting on '\n' still yields one item for single-line output
    # without a newline. Empty output yields [''], which is not counted.
    return sum(1 for line in listing.split('\n') if line)