# rssread.sh revision 3e14f97f673e8a630f076077de35afdd43dc1587
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
#
#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#
# printmsg - write a diagnostic message to standard error.
#
# Arguments: all arguments are joined into one string ("$*") and
#            printed as a single line.
# Outputs:   the message on file descriptor 2.
function printmsg
{
	# "--" ends option parsing so that a message which starts with
	# a '-' is printed instead of being treated as options of "print"
	print -u2 -- "$*"
}
# debugmsg - debug output hook; currently a no-op which always succeeds.
# Re-enable the commented-out printmsg call to get debug output.
function debugmsg
{
	# printmsg "$*"
	:
}
# fatal_error - print "<progname>: <message>" to standard error and
# terminate the shell with exit code 1.
#
# Globals:   progname (read)
# Arguments: all arguments are joined into the message text
function fatal_error
{
	typeset errtext="${progname}: $*"

	print -u2 "${errtext}"
	exit 1
}
# urlconnection_t - ksh93 type (typeset -T) which bundles the state of
# one URL connection (parsed URL parts, network file descriptors and the
# fifo/openssl helper state used for https) with the methods operating
# on it. Inside the methods "_" refers to the instance.
#
# NOTE(review): in this copy of the file many lines of the method bodies
# are missing (if/case/loop openers and their bodies are absent while the
# matching "fi"/"esac"/"done" are still present), so the type does not
# parse as-is. Compare against a pristine revision before editing.
typeset -T urlconnection_t=(
# public
typeset user_agent="ksh93/urlconnection_t"
# private variables
typeset protocol
typeset path1
typeset host
typeset path
typeset port
# file descriptors of the connection; -1 is the initial value
compound netfd=(
integer in=-1 # incoming traffic
integer out=-1 # outgoing traffic
)
# only used for https
compound ssl=(
compound fifo=(
typeset dir=""
typeset in=""
typeset out=""
)
integer openssl_client_pid=-1
)
# parse HTTP return code, cookies etc.
# Arguments: $1 - name of the variable which receives the parsed fields
#                 (content_type, content_length, transfer_encoding)
# Returns:   1 (with a message on stderr) if the status code is rejected
#            by the check below, otherwise 0.
# NOTE(review): the statements reading the status line and the header
# loop/case openers are not present in this copy.
function parse_http_response
{
nameref response="$1"
typeset h statuscode statusmsg i
# we use '\r' as additional IFS to filter the final '\r'
# the status code is matched against an extended regex anchored at both
# ends (~(Elr))
[[ "$statuscode" != ~(Elr)[0-9]* ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
# skip remaining headers
[[ "$i" == $'\r' ]] && break
# strip '\r' at the end
i="${i/~(Er)$'\r'/}"
# each of the following assignments removes everything up to a ':' plus
# the blanks after it from the header line, leaving the header value
response.content_type="${i/~(El).*:[[:blank:]]*/}"
;;
integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
;;
response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
;;
esac
done
return 0
}
# Arguments: $1 - transfer encoding; "chunked" selects the first branch
# NOTE(review): the loops that read the chunk sizes and copy the data are
# not present in this copy; only fragments of the chunked branch remain.
function cat_http_body
{
typeset emode="$1"
typeset hexchunksize="0"
integer chunksize=0
if [[ "${emode}" == "chunked" ]] ; then
# remove all '\r' characters from the chunk size string
hexchunksize="${hexchunksize//$'\r'/}"
[[ "${hexchunksize}" != "" ]] || continue
done
else
fi
return 0
}
# Arguments: $1 - URL; the part before "://" is stored in _.protocol
# NOTE(review): the remaining URL parsing (the "if" belonging to the
# "fi" below) is not present in this copy.
function init_url
{
_.protocol="${1%://*}"
fi
return 0
}
# close connection
# NOTE(review): the conditions guarding the three cleanup steps are not
# present in this copy (only their "fi" lines remain).
function close_connection
{
integer ret
# close the file descriptors stored in _.netfd.in and _.netfd.out
redirect {_.netfd.in}<&-
fi
redirect {_.netfd.out}<&-
fi
# NOTE(review): the backslash-escaped quotes are passed to "rm" as
# literal '"' characters and the expansion itself is unquoted - this
# looks unintended; confirm and use plain double quotes instead.
rm -r \"${_.ssl.fifo.dir}\"
fi
return 0
}
# NOTE(review): the protocol test selecting between the https (fifo)
# setup and the plain /dev/tcp connection is not present in this copy,
# and neither is the creation of _.ssl.fifo.dir.
function open_connection
{
_.ssl.fifo.in="${_.ssl.fifo.dir}/in"
_.ssl.fifo.out="${_.ssl.fifo.dir}/out"
# Use "errexit" to leave it at the first error
set -o errexit
mkfifo "${_.ssl.fifo.in}" "${_.ssl.fifo.out}"
# create async openssl child to handle https
else
# open a read/write descriptor on ksh93's /dev/tcp/<host>/<port> path
# and store its number in _.netfd.in
redirect {_.netfd.in}<> "/dev/tcp/${_.host}/${_.port}"
fi
return 0
}
# Arguments: $1 - the request text
# NOTE(review): the statements which actually write the request are not
# present in this copy.
function send_request
{
typeset request="$1"
set -o errexit
redirect {_.netfd.in}< "${_.ssl.fifo.out}"
else
fi
return 0
}
# NOTE(review): as shown here the function returns immediately at its
# first statement; the command whose status "$?" refers to, the port
# selection and the request construction are not present in this copy.
function cat_url
{
return $?
compound httpresponse # http response
# If URL did not contain a port number in the host part then look at the
# protocol to get the port number
esac
else
fi
printmsg "protocol=${_.protocol} port=${_.port} host=${_.host} path=${_.path}"
# prechecks
_.open_connection || return 1
# send HTTP request
request+="Connection: close\r\n"
# collect response and send it to stdout
{
_.cat_http_body "${httpresponse.transfer_encoding}"
} <&${_.netfd.in}
return 0
else
return 1
fi
# notreached
}
)
# html_entity_to_ascii - filter which writes its input to stdout with
# "&name;" entities replaced by the plain text found in the entity_cache
# table below; characters other than '&' are printed unchanged.
#
# NOTE(review): the loops which read the input character by character
# and the "case" opener for the entity parser are not present in this
# copy, and neither are the bodies of the cache-hit and numeric-literal
# branches.
function html_entity_to_ascii
{
typeset buf
typeset entity
typeset c
typeset value
# Note we use a static variable (typeset -S) here to make sure we
# don't lose the cache data between calls
typeset -S -A entity_cache=(
# entity to ascii (fixme: add UTF-8 transliterations)
["nbsp"]=' '
["lt"]='<'
["le"]='<='
["gt"]='>'
["ge"]='>='
["amp"]='&'
["quot"]='"'
["apos"]="'"
)
buf=""
# everything which is not the start of an entity is passed through
if [[ "$c" != "&" ]] ; then
print -n -r -- "${c}"
continue
fi
entity=""
# ';' terminates the entity name
";")
break
;;
entity+="$c"
continue
;;
*)
# any other character: print what was collected so far and continue
# with the enclosing loop ("continue 2")
# debugmsg "error &${entity}${c}#"
print -n -r -- "${entity}${c}"
entity=""
continue 2
;;
esac
done
value=""
if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
# debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
else
# entities starting with '#' are numeric literals
if [[ "${entity:0:1}" == "#" ]] ; then
# decimal literal
# hexadecimal literal
else
# unknown literal - pass-through
value="ENT=|${entity}|"
fi
# debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
fi
printf "%s" "${value}"
done
return 0
}
# dumb xhtml handler - no CSS, tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial-and-error manner and should be
# rewritten to parse xhtml correctly.
#
# Arguments: $1 - name of the callback associative array (also used as
#                 variable storage, see below)
#            $2 - event type (e.g. "document_end")
#            $3 - value belonging to the event
# Returns:   0
#
# NOTE(review): the outer "case" on the event type and the nested
# per-tag "case" statements are not present in this copy; only their
# terminators and the text output fragment remain.
function handle_html
{
# we can't use global variables here when multiple callbacks use the same
# callback function - but we can use the callback associative array for
# variable storage instead
nameref callbacks=${1}
typeset tag_type="$2"
typeset tag_value="$3"
esac
;;
esac
;;
# text is either printed verbatim ...
printf "%s" "${tag_value}"
else
# ... or with each run of whitespace replaced by a single space
printf "%s" "${tag_value//+([\n\r\t\v[:space:][:blank:]])/ }"
fi
;;
;;
document_end) ;;
esac
return 0
}
# handle_rss - callback for RSS input; for an "item" it emits the item
# as HTML and pipes that through html_entity_to_ascii twice.
#
# Arguments: $1 - name of the callback associative array (also used as
#                 variable storage, see below)
#            $2 - event type (e.g. "document_start", "document_end")
#            $3 - value belonging to the event
# Returns:   0
#
# NOTE(review): the outer "case" on the event type, the nested "case"
# openers and most of the item output are not present in this copy.
function handle_rss
{
# we can't use global variables here when multiple callbacks use the same
# callback function - but we can use the callback associative array for
# variable storage instead
nameref callbacks=${1}
typeset tag_type="$2"
typeset tag_value="$3"
item)
;;
esac
;;
item)
# note that each RSS item needs to be converted separately from RSS to HTML to plain text
# to make sure that the state of one RSS item doesn't affect others
(
printf $"<br />## begin description:"
printf $"<br />## end description<br />"
print # extra newline to make sure the sed pipeline gets flushed
) |
html_entity_to_ascii | # convert XML entities (e.g. decode RSS content to HTML code)
html_entity_to_ascii # convert HTML entities
;;
esac
;;
;;
document_start) ;;
document_end) ;;
esac
return 0
}
# xml_tok - XML tokenizer which reports what it finds through callbacks.
#
# Arguments: $1 - name of an associative array which maps event names to
#                 the command to run for that event. Events used in the
#                 visible code: "document_start", "tag_comment",
#                 "tag_begin", "document_end".
# Each callback is invoked as
#     <command> <array name> <event name> [<value> [<attributes>]]
# and callbacks with an empty array entry are skipped.
#
# NOTE(review): the loops which read the input character by character
# and several "if" openers (comment detection, single-tag detection,
# attribute detection, the end-tag callback) are not present in this
# copy, so the control flow below is incomplete.
# NOTE(review): "isendtag" is not declared with typeset in the visible
# code, unlike the other variables used here - confirm whether that is
# intended.
function xml_tok
{
typeset buf=""
typeset namebuf=""
typeset attrbuf=""
typeset c=""
typeset issingletag # bool: true/false (used for tags like "<br />")
nameref callbacks=${1}
[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"
isendtag=false
if [[ "$c" == "<" ]] ; then
# flush any text content
buf=""
fi
# a '/' marks an end tag
if [[ "$c" == "/" ]] ; then
isendtag=true
else
buf="$c"
fi
buf+="$c"
# handle comments
# did we read the comment completely ?
buf+=">"
buf+="$c"
done
fi
# the comment callback gets the buffer without its first three and last
# two characters
[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
buf=""
continue
fi
# check if the tag starts and ends at the same time (like "<br />")
issingletag=true
# remove the trailing '/'
buf="${buf%*/}"
else
issingletag=false
fi
# check if the tag has attributes (e.g. space after name)
# namebuf: text before the first whitespace; attrbuf: remainder of buf
# after a whitespace character
namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
else
attrbuf=""
fi
if ${isendtag} ; then
else
[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"
# handle tags like <br/> (which are start- and end-tag in one piece)
if ${issingletag} ; then
fi
fi
buf=""
else
buf+="$c"
fi
done
[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"
print # final newline to make filters like "sed" happy
}
# get_lc_messages - print the locale name to use for messages.
#
# The first non-empty value of ${LC_ALL}, ${LC_MESSAGES} and ${LANG}
# (checked in this order) is written to stdout; "C" is printed if all
# three are empty or unset.
#
# Returns: 0
function get_lc_messages
{
	typeset candidate

	for candidate in "${LC_ALL}" "${LC_MESSAGES}" "${LANG}" ; do
		if [[ "${candidate}" != "" ]] ; then
			print "${candidate}"
			return 0
		fi
	done

	print "C"
	return 0
}
# do_rssread - entry point for reading one feed; the main code calls it
# with the URL as its only argument.
#
# Returns: 0
#
# NOTE(review): this copy is missing the variable assignments which
# belong to the "export" statement (the line continuation now runs
# straight into a comment, leaving a bare "export") and the pipeline
# which fetches and tokenizes the feed; "$1" is not used by the visible
# code. Compare against a pristine revision before editing.
function do_rssread
{
# set unicode locale since RSS is encoded in UTF-8
# (and make sure $LC_MESSAGES is set to the parent
# process's locale that all error messages are using
export \
# return non-zero exit code for this function if the rss processing below fails
set -o errexit
# need extra newline after cat_url to terminate line with $'\n'
# to make "xml_tok" happy
return 0
}
# usage - print the usage message and exit with status 2.
#
# Globals: progname (read), rssread_usage (read), OPTIND/OPT (written)
function usage
{
	# reset the option parser and feed it '-?', which makes the
	# ksh93 getopts builtin print the usage text generated from
	# ${rssread_usage} to standard error
	OPTIND=0
	getopts -a "${progname}" "${rssread_usage}" OPT '-?'
	exit 2
}
# Bind these utilities to their ksh93 builtin implementations
builtin basename cat mkfifo
typeset -A rsstok_cb # callbacks for xml_tok
typeset -A xhtmltok_cb # callbacks for xml_tok
typeset -A item
typeset -A bookmark_urls

# "random" urls for testing; the key can be given on the command line
# instead of the full URL
bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"
	["wikipedia_command_shells"]="http://en.wikipedia.org/w/index.php?title=Comparison_of_command_shells&feed=rss&action=history"
)
# Option/usage description passed to the ksh93 getopts builtin (option
# parsing loop and usage()).
#
# NOTE(review): the string looks damaged in this copy - the
# "[+DESCRIPTION?" section is never closed with a ']' and no entry
# declares the "I" option which the option parsing loop handles.
# Restore the missing lines from a pristine revision.
typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
which fetches RSS streams via HTTP and converts them from
[ url ]
[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'
# Main program: parse the options, resolve the URL (either given
# directly or as the name of an entry in bookmark_urls) and read the
# feed.
typeset noiconv=false

# "-I" turns the iconv post-processing off, "+I" turns it back on;
# anything else prints the usage message
while getopts -a "${progname}" "${rssread_usage}" OPT ; do
	# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case "${OPT}" in
		I)	noiconv=true ;;
		+I)	noiconv=false ;;
		*)	usage ;;
	esac
done
# drop the parsed options so that $1 is the first operand (the URL)
shift $((OPTIND-1))

typeset url="$1"

if [[ "${url}" == "" ]] ; then
	fatal_error $"No url given."
fi

# a bookmark name is replaced by the URL stored for it
if [[ "${bookmark_urls[${url}]}" != "" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls[${url}]}"
	url="${bookmark_urls[${url}]}"
fi

if ${noiconv} ; then
	do_rssread "${url}"
else
	# NOTE(review): this branch was empty; the feed data is UTF-8, so
	# the output is converted from UTF-8 to the codeset of the current
	# locale - confirm the exact iconv invocation against a pristine
	# revision.
	do_rssread "${url}" | iconv -f "UTF-8"
fi

exit 0
#EOF.