# If in your case the order of urls in `url.txt` is important, that is,
# `1.html` should contain the data of the first url, then `2.html`
# should correspond to the second url, and so on, then you can process
# the urls one by one.
#
# The following script takes the desired action for each url:
#!/bin/bash
# Fetch every URL listed in an input file (one URL per line) into a
# sequentially numbered HTML file, then convert each to plain text.
#
# Usage: crawl.sh urls.txt [dest_dir]
#   $1 - file containing one URL per line (required)
#   $2 - destination directory (optional; defaults to the original
#        hard-coded path under $HOME)
set -u   # error out on unset variables (e.g. missing $1)

infile="$1"
# NOTE: tilde does not expand inside double quotes — "~/..." would create a
# literal directory named '~'. Use $HOME instead.
dest_dir="${2:-$HOME/Desktop/ProjectM2/data/crawl}"

# create html and txt dirs inside dest_dir
mkdir -p "$dest_dir"/{html,txt}

c=1
# '|| [[ -n "$url" ]]' also processes a final line lacking a trailing newline.
while IFS= read -r url || [[ -n "$url" ]]; do
  echo "Fetch $url into $c.html"
  # A failed fetch is reported but does not stop the remaining URLs,
  # preserving the original best-effort behavior.
  if ! wget -q -O "$dest_dir/html/$c.html" "$url"; then
    echo "warn: failed to fetch $url" >&2
  fi
  echo "Convert $c.html to $c.txt"
  html2text -o "$dest_dir/txt/$c.txt" "$dest_dir/html/$c.html"
  c=$(( c + 1 ))
done < "$infile"
# The script accounts for an input file, in this case `url.txt`. It
# creates two directories (`html`, `txt`) under your destination
# directory `~/Desktop/ProjectM2/data/crawl` in order to better organize
# the resulting files. We read the urls from the file `url.txt` line by
# line with the help of a while loop ([Read file line by line][1]). With
# `wget` you can specify the desired output filename with the `-O`
# option, thus you can name your file as you wish, in your case a
# sequence number. The `-q` option is used to remove wget messages from
# the command line. In `html2text` you can specify the output file using
# `-o`.
#
# [1]: https://stackoverflow.com/questions/10929453/read-a-file-line-by-line-assigning-the-value-to-a-variable
#
# [marcell] [so/q/47844231] [cc by-sa 3.0]
$
cheat.sh