# If in your case the order of urls in `url.txt` is important, that is,
# `1.html` should contain the data of the first url, then `2.html`
# should correspond to the second url, and so on, then you can process
# the urls one by one.
#
# The following script takes the desired action for each url:
#!/bin/bash
# Fetch every URL listed in an input file (one URL per line) into a
# sequentially numbered HTML file, then convert each to plain text.
#
# Usage: crawl.sh urls.txt [dest_dir]
#   $1 - file containing one URL per line (required)
#   $2 - destination directory (optional; defaults to the original
#        hard-coded path under $HOME)
set -u   # error out on unset variables (e.g. missing $1)

infile="$1"
# NOTE: tilde does not expand inside double quotes — "~/..." would create a
# literal directory named '~'. Use $HOME instead.
dest_dir="${2:-$HOME/Desktop/ProjectM2/data/crawl}"

# create html and txt dirs inside dest_dir
mkdir -p "$dest_dir"/{html,txt}

c=1
# '|| [[ -n "$url" ]]' also processes a final line lacking a trailing newline.
while IFS= read -r url || [[ -n "$url" ]]; do
  echo "Fetch $url into $c.html"
  # A failed fetch is reported but does not stop the remaining URLs,
  # preserving the original best-effort behavior.
  if ! wget -q -O "$dest_dir/html/$c.html" "$url"; then
    echo "warn: failed to fetch $url" >&2
  fi
  echo "Convert $c.html to $c.txt"
  html2text -o "$dest_dir/txt/$c.txt" "$dest_dir/html/$c.html"
  c=$(( c + 1 ))
done < "$infile"
# The script accounts for an input file, in this case `url.txt`. It
# creates two directories (`html`, `txt`) under your destination
# directory `~/Desktop/ProjectM2/data/crawl` in order to better organize
# the resulting files. We read the urls from the file `url.txt` line by
# line with the help of a while loop ([Read file line by line][1]). With
# `wget` you can specify the desired output filename with the `-O`
# option, thus you can name your file as you wish, in your case a
# sequence number. The `-q` option is used to remove wget messages from
# the command line. In `html2text` you can specify the output file using
# `-o`.
#
# [1]: https://stackoverflow.com/questions/10929453/read-a-file-line-by-line-assigning-the-value-to-a-variable
#
# [marcell] [so/q/47844231] [cc by-sa 3.0]
$
cheat.sh