AI For Trading: text processing 基本处理方法 (82)

Text Processing

Capturing Text Data

Plain Text

import os

# Read in a plain text file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (assumes the file is UTF-8 -- TODO confirm).
with open(os.path.join("data", "hieroglyph.txt"), "r", encoding="utf-8") as f:
    text = f.read()

# Print after the with-block so the file handle is already closed.
print(text)
Hieroglyphic writing dates from c. 3000 BC, and is composed of hundreds of symbols. A hieroglyph can represent a word, a sound, or a silent determinative; and the same symbol can serve different purposes in different contexts. Hieroglyphs were a formal script, used on stone monuments and in tombs, that could be as detailed as individual works of art.

Tabular Data

import pandas as pd

# Load the news CSV into a dataframe and preview the two columns of interest
df = pd.read_csv(os.path.join("data", "news.csv"))
df.loc[:, ['publisher', 'title']].head()

# Lowercase every headline in place, then preview the same columns again
df['title'] = df['title'].str.lower()
df.loc[:, ['publisher', 'title']].head()
publisher title
0 Livemint fed's charles plosser sees high bar for change...
1 IFA Magazine us open: stocks fall after fed official hints ...
2 IFA Magazine fed risks falling 'behind the curve', charles ...
3 Moneynews fed's plosser: nasty weather has curbed job gr...
4 NASDAQ plosser: fed may have to accelerate tapering pace

Online Resource

import requests
import json

# Fetch data from a REST API.
# The timeout keeps the call from hanging indefinitely on an unresponsive
# server; raise_for_status() turns an HTTP error response (4xx/5xx) into an
# exception here instead of a confusing JSON/KeyError further down.
r = requests.get(
    "https://quotes.rest/qod.json", timeout=10)
r.raise_for_status()
res = r.json()
print(json.dumps(res, indent=4))

# Extract relevant object and field: the first entry under contents -> quotes
q = res["contents"]["quotes"][0]
print(q["quote"], "\n--", q["author"])
{
    "success": {
        "total": 1
    },
    "contents": {
        "quotes": [
            {
                "quote": "I wanted to achieve something essential in life, something that is not measured by money or position in society. The mountains are not stadiums where I satisfy my ambitions to achieve. They are my cathedrals, the houses of my religion. In the mountains I attempt to understand my life. They are the way I practice my religion. In the mountains I celebrate creation, on each journey I am reborn.",
                "author": "Anatoli Boukreev",
                "length": "394",
                "tags": [
                    "achievement",
                    "ambition",
                    "inspire",
                    "journey",
                    "mountaineering",
                    "tso-life"
                ],
                "category": "inspire",
                "title": "Inspiring Quote of the day",
                "date": "2019-06-09",
                "id": null
            }
        ],
        "copyright": "2017-19 theysaidso.com"
    }
}
I wanted to achieve something essential in life, something that is not measured by money or position in society. The mountains are not stadiums where I satisfy my ambitions to achieve. They are my cathedrals, the houses of my religion. In the mountains I attempt to understand my life. They are the way I practice my religion. In the mountains I celebrate creation, on each journey I am reborn. 
-- Anatoli Boukreev

Cleaning

import requests

# Fetch a web page.
# The timeout bounds how long the request may block; raise_for_status()
# fails fast on an HTTP error so an error page is not printed as if it
# were the real content.
r = requests.get("https://news.ycombinator.com", timeout=10)
r.raise_for_status()
print(r.text)
    <html op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?1UVKBKKrr3Dybiy50LkU">
                <link rel="shortcut icon" href="favicon.ico">
              <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">
            <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">
            <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.gif" width="18" height="18" style="border:1px white solid;"></a></td>
                      <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
                  <a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a href="jobs">jobs</a> | <a href="submit">submit</a>            </span></td><td style="text-align:right;padding-right:4px;"><span class="pagetop">
                                  <a href="login?goto=news">login</a>
                              </span></td>
                  </tr></table></td></tr>
    <tr id="pagespace" title="" style="height:10px"></tr><tr><td><table border="0" cellpadding="0" cellspacing="0" class="itemlist">
                  <tr class='athing' id='20135938'>
          <td align="right" valign="top" class="title"><span class="rank">1.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20135938' href='vote?id=20135938&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.npr.org/2019/06/08/730898429/huawei-accused-of-technology-theft" class="storylink">Huawei Accused of Technology Theft</a><span class="sitebit comhead"> (<a href="from?site=npr.org"><span class="sitestr">npr.org</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20135938">85 points</span> by <a href="user?id=tooltalk" class="hnuser">tooltalk</a> <span class="age"><a href="item?id=20135938">1 hour ago</a></span> <span id="unv_20135938"></span> | <a href="hide?id=20135938&goto=news">hide</a> | <a href="item?id=20135938">20 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20135318'>
          <td align="right" valign="top" class="title"><span class="rank">2.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20135318' href='vote?id=20135318&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://apnews.com/ff973c5c769c4d1d872255f04c6c1f11" class="storylink">Gaudí’s Sagrada Familia gets building permit after 137 years</a><span class="sitebit comhead"> (<a href="from?site=apnews.com"><span class="sitestr">apnews.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20135318">98 points</span> by <a href="user?id=antigizmo" class="hnuser">antigizmo</a> <span class="age"><a href="item?id=20135318">4 hours ago</a></span> <span id="unv_20135318"></span> | <a href="hide?id=20135318&goto=news">hide</a> | <a href="item?id=20135318">21 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20135726'>
          <td align="right" valign="top" class="title"><span class="rank">3.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20135726' href='vote?id=20135726&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://ai.googleblog.com/2019/06/introducing-google-research-football.html" class="storylink">Google Research Football: A Novel Reinforcement Learning Environment</a><span class="sitebit comhead"> (<a href="from?site=googleblog.com"><span class="sitestr">googleblog.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20135726">32 points</span> by <a href="user?id=haditab" class="hnuser">haditab</a> <span class="age"><a href="item?id=20135726">2 hours ago</a></span> <span id="unv_20135726"></span> | <a href="hide?id=20135726&goto=news">hide</a> | <a href="item?id=20135726">9 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20129942'>
          <td align="right" valign="top" class="title"><span class="rank">4.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20129942' href='vote?id=20129942&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.wired.com/story/apple-find-my-cryptography-bluetooth/" class="storylink">The clever cryptography behind Apple’s “Find My” feature</a><span class="sitebit comhead"> (<a href="from?site=wired.com"><span class="sitestr">wired.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20129942">150 points</span> by <a href="user?id=nnx" class="hnuser">nnx</a> <span class="age"><a href="item?id=20129942">6 hours ago</a></span> <span id="unv_20129942"></span> | <a href="hide?id=20129942&goto=news">hide</a> | <a href="item?id=20129942">51 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20132520'>
          <td align="right" valign="top" class="title"><span class="rank">5.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20132520' href='vote?id=20132520&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://ai.google/research/pubs/pub48190" class="storylink">Zanzibar: Consistent, Global Authorization System</a><span class="sitebit comhead"> (<a href="from?site=ai.google"><span class="sitestr">ai.google</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20132520">280 points</span> by <a href="user?id=acjohnson55" class="hnuser">acjohnson55</a> <span class="age"><a href="item?id=20132520">11 hours ago</a></span> <span id="unv_20132520"></span> | <a href="hide?id=20132520&goto=news">hide</a> | <a href="item?id=20132520">94 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20135877'>
          <td align="right" valign="top" class="title"><span class="rank">6.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20135877' href='vote?id=20135877&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://9to5mac.com/2019/06/08/ios-13-location-permissions/" class="storylink">iOS 13 now shows you a map of where apps have been tracking you</a><span class="sitebit comhead"> (<a href="from?site=9to5mac.com"><span class="sitestr">9to5mac.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20135877">92 points</span> by <a href="user?id=notlukesky" class="hnuser">notlukesky</a> <span class="age"><a href="item?id=20135877">2 hours ago</a></span> <span id="unv_20135877"></span> | <a href="hide?id=20135877&goto=news">hide</a> | <a href="item?id=20135877">16 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20133187'>
          <td align="right" valign="top" class="title"><span class="rank">7.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20133187' href='vote?id=20133187&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.nytimes.com/2019/06/07/business/economy/age-discrimination-jobs-hiring.html" class="storylink">New Evidence of Age Bias in Hiring, and a Push to Fight It</a><span class="sitebit comhead"> (<a href="from?site=nytimes.com"><span class="sitestr">nytimes.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20133187">271 points</span> by <a href="user?id=howard941" class="hnuser">howard941</a> <span class="age"><a href="item?id=20133187">8 hours ago</a></span> <span id="unv_20133187"></span> | <a href="hide?id=20133187&goto=news">hide</a> | <a href="item?id=20133187">240 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20133620'>
          <td align="right" valign="top" class="title"><span class="rank">8.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20133620' href='vote?id=20133620&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="http://news.mit.edu/2019/ai-chip-light-computing-faster-0605" class="storylink">Chip design drastically reduces energy needed to compute with light</a><span class="sitebit comhead"> (<a href="from?site=mit.edu"><span class="sitestr">mit.edu</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20133620">95 points</span> by <a href="user?id=rbanffy" class="hnuser">rbanffy</a> <span class="age"><a href="item?id=20133620">8 hours ago</a></span> <span id="unv_20133620"></span> | <a href="hide?id=20133620&goto=news">hide</a> | <a href="item?id=20133620">16 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20133844'>
          <td align="right" valign="top" class="title"><span class="rank">9.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20133844' href='vote?id=20133844&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://grover.allenai.org/" class="storylink">Grover – A State-of-the-Art Defense Against Neural Fake News</a><span class="sitebit comhead"> (<a href="from?site=allenai.org"><span class="sitestr">allenai.org</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20133844">94 points</span> by <a href="user?id=Qworg" class="hnuser">Qworg</a> <span class="age"><a href="item?id=20133844">7 hours ago</a></span> <span id="unv_20133844"></span> | <a href="hide?id=20133844&goto=news">hide</a> | <a href="item?id=20133844">52 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20136223'>
          <td align="right" valign="top" class="title"><span class="rank">10.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20136223' href='vote?id=20136223&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="http://www.engadget.com/2019/06/08/tesla-model-3-software-downgrade/" class="storylink">Tesla will soon downgrade software on the entry-level Model 3</a><span class="sitebit comhead"> (<a href="from?site=engadget.com"><span class="sitestr">engadget.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20136223">18 points</span> by <a href="user?id=mbreese" class="hnuser">mbreese</a> <span class="age"><a href="item?id=20136223">45 minutes ago</a></span> <span id="unv_20136223"></span> | <a href="hide?id=20136223&goto=news">hide</a> | <a href="item?id=20136223">2 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20135047'>
          <td align="right" valign="top" class="title"><span class="rank">11.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20135047' href='vote?id=20135047&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://practicejs.com/" class="storylink">PracticeJS – Coding exercises in a mobile friendly web app</a><span class="sitebit comhead"> (<a href="from?site=practicejs.com"><span class="sitestr">practicejs.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20135047">36 points</span> by <a href="user?id=rem_one" class="hnuser">rem_one</a> <span class="age"><a href="item?id=20135047">4 hours ago</a></span> <span id="unv_20135047"></span> | <a href="hide?id=20135047&goto=news">hide</a> | <a href="item?id=20135047">9 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20132888'>
          <td align="right" valign="top" class="title"><span class="rank">12.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20132888' href='vote?id=20132888&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://github.blog/2019-06-06-generate-new-repositories-with-repository-templates/" class="storylink">Generate new repositories with repository templates</a><span class="sitebit comhead"> (<a href="from?site=github.blog"><span class="sitestr">github.blog</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20132888">117 points</span> by <a href="user?id=mxstbr" class="hnuser">mxstbr</a> <span class="age"><a href="item?id=20132888">10 hours ago</a></span> <span id="unv_20132888"></span> | <a href="hide?id=20132888&goto=news">hide</a> | <a href="item?id=20132888">20 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20134021'>
          <td align="right" valign="top" class="title"><span class="rank">13.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20134021' href='vote?id=20134021&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.atlasobscura.com/articles/raines-sandwich" class="storylink">To Evade Pre-Prohibition Drinking Laws, New Yorkers Created an Inedible Sandwich</a><span class="sitebit comhead"> (<a href="from?site=atlasobscura.com"><span class="sitestr">atlasobscura.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20134021">51 points</span> by <a href="user?id=camtarn" class="hnuser">camtarn</a> <span class="age"><a href="item?id=20134021">7 hours ago</a></span> <span id="unv_20134021"></span> | <a href="hide?id=20134021&goto=news">hide</a> | <a href="item?id=20134021">10 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20131852'>
          <td align="right" valign="top" class="title"><span class="rank">14.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20131852' href='vote?id=20131852&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.sciencenews.org/article/gut-bacteria-may-change-way-many-drugs-work-body" class="storylink">Gut bacteria may change the way many drugs work in the body</a><span class="sitebit comhead"> (<a href="from?site=sciencenews.org"><span class="sitestr">sciencenews.org</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20131852">147 points</span> by <a href="user?id=idl3Y" class="hnuser">idl3Y</a> <span class="age"><a href="item?id=20131852">14 hours ago</a></span> <span id="unv_20131852"></span> | <a href="hide?id=20131852&goto=news">hide</a> | <a href="item?id=20131852">32 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20133151'>
          <td align="right" valign="top" class="title"><span class="rank">15.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20133151' href='vote?id=20133151&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.eff.org/deeplinks/2019/06/adversarial-interoperability-reviving-elegant-weapon-more-civilized-age-slay" class="storylink">Adversarial interoperability: reviving an old weapon to slay today's monopolies</a><span class="sitebit comhead"> (<a href="from?site=eff.org"><span class="sitestr">eff.org</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20133151">78 points</span> by <a href="user?id=liotier" class="hnuser">liotier</a> <span class="age"><a href="item?id=20133151">9 hours ago</a></span> <span id="unv_20133151"></span> | <a href="hide?id=20133151&goto=news">hide</a> | <a href="item?id=20133151">11 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20135466'>
          <td align="right" valign="top" class="title"><span class="rank">16.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20135466' href='vote?id=20135466&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://arstechnica.com/science/2019/06/creative-thinking-researchers-propose-solar-methanol-island-using-ocean-co%E2%82%82/" class="storylink">Researchers propose solar methanol island using ocean CO₂</a><span class="sitebit comhead"> (<a href="from?site=arstechnica.com"><span class="sitestr">arstechnica.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20135466">11 points</span> by <a href="user?id=DoreenMichele" class="hnuser">DoreenMichele</a> <span class="age"><a href="item?id=20135466">3 hours ago</a></span> <span id="unv_20135466"></span> | <a href="hide?id=20135466&goto=news">hide</a> | <a href="item?id=20135466">5 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20131710'>
          <td align="right" valign="top" class="title"><span class="rank">17.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20131710' href='vote?id=20131710&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.nature.com/articles/d41586-019-01780-9" class="storylink">The human body is a mosaic of different genomes</a><span class="sitebit comhead"> (<a href="from?site=nature.com"><span class="sitestr">nature.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20131710">97 points</span> by <a href="user?id=pseudolus" class="hnuser">pseudolus</a> <span class="age"><a href="item?id=20131710">14 hours ago</a></span> <span id="unv_20131710"></span> | <a href="hide?id=20131710&goto=news">hide</a> | <a href="item?id=20131710">11 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20132889'>
          <td align="right" valign="top" class="title"><span class="rank">18.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20132889' href='vote?id=20132889&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://colorkitty.com/" class="storylink">ColorKitty: Find palettes from pictures</a><span class="sitebit comhead"> (<a href="from?site=colorkitty.com"><span class="sitestr">colorkitty.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20132889">61 points</span> by <a href="user?id=based2" class="hnuser">based2</a> <span class="age"><a href="item?id=20132889">10 hours ago</a></span> <span id="unv_20132889"></span> | <a href="hide?id=20132889&goto=news">hide</a> | <a href="item?id=20132889">15 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20135910'>
          <td align="right" valign="top" class="title"><span class="rank">19.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20135910' href='vote?id=20135910&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://techcrunch.com/2019/06/08/former-unity-technology-vp-files-lawsuit-alleging-ceo-sexually-harassed-her/" class="storylink">Former Unity Technology VP files lawsuit alleging CEO sexually harassed her</a><span class="sitebit comhead"> (<a href="from?site=techcrunch.com"><span class="sitestr">techcrunch.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20135910">15 points</span> by <a href="user?id=mikkelewis" class="hnuser">mikkelewis</a> <span class="age"><a href="item?id=20135910">2 hours ago</a></span> <span id="unv_20135910"></span> | <a href="hide?id=20135910&goto=news">hide</a> | <a href="item?id=20135910">1 comment</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20131672'>
          <td align="right" valign="top" class="title"><span class="rank">20.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20131672' href='vote?id=20131672&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="http://nautil.us/issue/73/play/why-it-pays-to-play-around" class="storylink">Nature invented play long before it invented us</a><span class="sitebit comhead"> (<a href="from?site=nautil.us"><span class="sitestr">nautil.us</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20131672">92 points</span> by <a href="user?id=dnetesn" class="hnuser">dnetesn</a> <span class="age"><a href="item?id=20131672">15 hours ago</a></span> <span id="unv_20131672"></span> | <a href="hide?id=20131672&goto=news">hide</a> | <a href="item?id=20131672">2 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20133308'>
          <td align="right" valign="top" class="title"><span class="rank">21.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20133308' href='vote?id=20133308&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.weforum.org/agenda/2019/01/2-davos-experts-says-it-s-time-to-switch-to-a-four-day-working-week/" class="storylink">It's time to switch to a four-day working week, say two experts</a><span class="sitebit comhead"> (<a href="from?site=weforum.org"><span class="sitestr">weforum.org</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20133308">195 points</span> by <a href="user?id=joeyespo" class="hnuser">joeyespo</a> <span class="age"><a href="item?id=20133308">8 hours ago</a></span> <span id="unv_20133308"></span> | <a href="hide?id=20133308&goto=news">hide</a> | <a href="item?id=20133308">103 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20111178'>
          <td align="right" valign="top" class="title"><span class="rank">22.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20111178' href='vote?id=20111178&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://blog.npmjs.org/post/185397814280/plot-to-steal-cryptocurrency-foiled-by-the-npm" class="storylink">Plot to steal cryptocurrency foiled by NPM</a><span class="sitebit comhead"> (<a href="from?site=npmjs.org"><span class="sitestr">npmjs.org</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20111178">73 points</span> by <a href="user?id=soheilpro" class="hnuser">soheilpro</a> <span class="age"><a href="item?id=20111178">5 hours ago</a></span> <span id="unv_20111178"></span> | <a href="hide?id=20111178&goto=news">hide</a> | <a href="item?id=20111178">49 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20132190'>
          <td align="right" valign="top" class="title"><span class="rank">23.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20132190' href='vote?id=20132190&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.bbc.com/news/blogs-china-blog-48552907" class="storylink">WeChat and the Surveillance State</a><span class="sitebit comhead"> (<a href="from?site=bbc.com"><span class="sitestr">bbc.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20132190">348 points</span> by <a href="user?id=Markoff" class="hnuser">Markoff</a> <span class="age"><a href="item?id=20132190">12 hours ago</a></span> <span id="unv_20132190"></span> | <a href="hide?id=20132190&goto=news">hide</a> | <a href="item?id=20132190">176 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20133817'>
          <td align="right" valign="top" class="title"><span class="rank">24.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20133817' href='vote?id=20133817&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://bradleytaunt.com/2019/06/08/html-like-1999/" class="storylink">Write HTML Like It's 1999</a><span class="sitebit comhead"> (<a href="from?site=bradleytaunt.com"><span class="sitestr">bradleytaunt.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20133817">182 points</span> by <a href="user?id=bradley_taunt" class="hnuser">bradley_taunt</a> <span class="age"><a href="item?id=20133817">7 hours ago</a></span> <span id="unv_20133817"></span> | <a href="hide?id=20133817&goto=news">hide</a> | <a href="item?id=20133817">134 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20133972'>
          <td align="right" valign="top" class="title"><span class="rank">25.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20133972' href='vote?id=20133972&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.nytimes.com/2019/06/06/magazine/mike-gravel-teens-twitter-presidential-campaign.html" class="storylink">Teenagers campaigning for Mike Gravel on Twitter</a><span class="sitebit comhead"> (<a href="from?site=nytimes.com"><span class="sitestr">nytimes.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20133972">78 points</span> by <a href="user?id=ordiblah" class="hnuser">ordiblah</a> <span class="age"><a href="item?id=20133972">7 hours ago</a></span> <span id="unv_20133972"></span> | <a href="hide?id=20133972&goto=news">hide</a> | <a href="item?id=20133972">35 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20131692'>
          <td align="right" valign="top" class="title"><span class="rank">26.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20131692' href='vote?id=20131692&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.latimes.com/local/lanow/la-me-ln-virulent-newcastle-disease-outbreak-in-southern-california-20190607-story.html" class="storylink">To stop a virus, California has euthanized more than 1.2M birds</a><span class="sitebit comhead"> (<a href="from?site=latimes.com"><span class="sitestr">latimes.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20131692">84 points</span> by <a href="user?id=pseudolus" class="hnuser">pseudolus</a> <span class="age"><a href="item?id=20131692">15 hours ago</a></span> <span id="unv_20131692"></span> | <a href="hide?id=20131692&goto=news">hide</a> | <a href="item?id=20131692">54 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20136093'>
          <td align="right" valign="top" class="title"><span class="rank">27.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20136093' href='vote?id=20136093&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="item?id=20136093" class="storylink">Ask HN: Do all VPN's suck?</a></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20136093">5 points</span> by <a href="user?id=mrsmee89" class="hnuser">mrsmee89</a> <span class="age"><a href="item?id=20136093">1 hour ago</a></span> <span id="unv_20136093"></span> | <a href="hide?id=20136093&goto=news">hide</a> | <a href="item?id=20136093">3 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20130015'>
          <td align="right" valign="top" class="title"><span class="rank">28.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20130015' href='vote?id=20130015&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://techcrunch.com/2019/06/07/make-magazine-maker-media-layoffs/" class="storylink">Maker Faire halts operations and lays off all staff</a><span class="sitebit comhead"> (<a href="from?site=techcrunch.com"><span class="sitestr">techcrunch.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20130015">637 points</span> by <a href="user?id=sohkamyung" class="hnuser">sohkamyung</a> <span class="age"><a href="item?id=20130015">1 day ago</a></span> <span id="unv_20130015"></span> | <a href="hide?id=20130015&goto=news">hide</a> | <a href="item?id=20130015">292 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20132575'>
          <td align="right" valign="top" class="title"><span class="rank">29.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20132575' href='vote?id=20132575&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://phys.org/news/2019-06-usa-lags-eu-brazil-china.html" class="storylink">USA lags behind EU, Brazil and China in banning harmful pesticides</a><span class="sitebit comhead"> (<a href="from?site=phys.org"><span class="sitestr">phys.org</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20132575">170 points</span> by <a href="user?id=QuickToBan" class="hnuser">QuickToBan</a> <span class="age"><a href="item?id=20132575">11 hours ago</a></span> <span id="unv_20132575"></span> | <a href="hide?id=20132575&goto=news">hide</a> | <a href="item?id=20132575">57 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                    <tr class='athing' id='20134171'>
          <td align="right" valign="top" class="title"><span class="rank">30.</span></td>      <td valign="top" class="votelinks"><center><a id='up_20134171' href='vote?id=20134171&how=up&goto=news'><div class='votearrow' title='upvote'></div></a></center></td><td class="title"><a href="https://www.economist.com/finance-and-economics/2019/06/06/advertising-may-make-people-miserable-but-it-still-has-its-uses" class="storylink">Advertising may make people miserable, but it still has its uses</a><span class="sitebit comhead"> (<a href="from?site=economist.com"><span class="sitestr">economist.com</span></a>)</span></td></tr><tr><td colspan="2"></td><td class="subtext">
            <span class="score" id="score_20134171">37 points</span> by <a href="user?id=jkuria" class="hnuser">jkuria</a> <span class="age"><a href="item?id=20134171">7 hours ago</a></span> <span id="unv_20134171"></span> | <a href="hide?id=20134171&goto=news">hide</a> | <a href="item?id=20134171">62 comments</a>              </td></tr>
          <tr class="spacer" style="height:5px"></tr>
                <tr class="morespace" style="height:10px"></tr><tr><td colspan="2"></td><td class="title"><a href="news?p=2" class="morelink" rel="next">More</a></td></tr>
      </table>
    </td></tr>
    <tr><td><img src="s.gif" height="10" width="0"><table width="100%" cellspacing="0" cellpadding="1"><tr><td bgcolor="#ff6600"></td></tr></table><br><center><span class="yclinks"><a href="newsguidelines.html">Guidelines</a>
            | <a href="newsfaq.html">FAQ</a>
            | <a href="mailto:hn@ycombinator.com">Support</a>
            | <a href="https://github.com/HackerNews/API">API</a>
            | <a href="security.html">Security</a>
            | <a href="lists">Lists</a>
            | <a href="bookmarklet.html" rel="nofollow">Bookmarklet</a>
            | <a href="http://www.ycombinator.com/legal/">Legal</a>
            | <a href="http://www.ycombinator.com/apply/">Apply to YC</a>
            | <a href="mailto:hn@ycombinator.com">Contact</a></span><br><br><form method="get" action="//hn.algolia.com/">Search:
              <input type="text" name="q" value="" size="17" autocorrect="off" spellcheck="false" autocapitalize="off" autocomplete="false"></form>
                </center></td></tr>
          </table></center></body><script type='text/javascript' src='hn.js?1UVKBKKrr3Dybiy50LkU'></script>
      </html>
import re

# Strip HTML tags with a regular expression
# Non-greedy match of anything shaped like <...>. Note that '.' does not match
# newlines by default, so a tag broken across lines would be left in place.
pattern = re.compile(r'<.*?>')
stripped = pattern.sub('', r.text)  # delete every match, keep the text between
print(stripped)
        Hacker News

                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login

      1.      Huawei Accused of Technology Theft (npr.org)
        85 points by tooltalk 1 hour ago  | hide | 20 comments              

      2.      Gaudí’s Sagrada Familia gets building permit after 137 years (apnews.com)
        98 points by antigizmo 4 hours ago  | hide | 21 comments              

      3.      Google Research Football: A Novel Reinforcement Learning Environment (googleblog.com)
        32 points by haditab 2 hours ago  | hide | 9 comments              

      4.      The clever cryptography behind Apple’s “Find My” feature (wired.com)
        150 points by nnx 6 hours ago  | hide | 51 comments              

      5.      Zanzibar: Consistent, Global Authorization System (ai.google)
        280 points by acjohnson55 11 hours ago  | hide | 94 comments              

      6.      iOS 13 now shows you a map of where apps have been tracking you (9to5mac.com)
        92 points by notlukesky 2 hours ago  | hide | 16 comments              

      7.      New Evidence of Age Bias in Hiring, and a Push to Fight It (nytimes.com)
        271 points by howard941 8 hours ago  | hide | 240 comments              

      8.      Chip design drastically reduces energy needed to compute with light (mit.edu)
        95 points by rbanffy 8 hours ago  | hide | 16 comments              

            More

Guidelines
        | FAQ
        | Support
        | API
        | Security
        | Lists
        | Bookmarklet
        | Legal
        | Apply to YC
        | ContactSearch:
from bs4 import BeautifulSoup

# Strip the markup by parsing the page with the Beautiful Soup library
soup = BeautifulSoup(r.text, "html5lib")
page_text = soup.get_text()  # text content of the parsed document
print(page_text)
        Hacker News

                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login

      1.      Huawei Accused of Technology Theft (npr.org)
        85 points by tooltalk 1 hour ago  | hide | 20 comments              

      2.      Gaudí’s Sagrada Familia gets building permit after 137 years (apnews.com)
        98 points by antigizmo 4 hours ago  | hide | 21 comments              

      3.      Google Research Football: A Novel Reinforcement Learning Environment (googleblog.com)
        32 points by haditab 2 hours ago  | hide | 9 comments              

      4.      The clever cryptography behind Apple’s “Find My” feature (wired.com)
        150 points by nnx 6 hours ago  | hide | 51 comments              

      5.      Zanzibar: Consistent, Global Authorization System (ai.google)
        280 points by acjohnson55 11 hours ago  | hide | 94 comments              

      6.      iOS 13 now shows you a map of where apps have been tracking you (9to5mac.com)
        92 points by notlukesky 2 hours ago  | hide | 16 comments              

      7.      New Evidence of Age Bias in Hiring, and a Push to Fight It (nytimes.com)
        271 points by howard941 8 hours ago  | hide | 240 comments              

            More

Guidelines
        | FAQ
        | Support
        | API
        | Security
        | Lists
        | Bookmarklet
        | Legal
        | Apply to YC
        | ContactSearch:
# Find all articles
# Each story on the page is a <tr> row carrying class "athing"; collect them all.
summaries = soup.find_all("tr", class_="athing")
summaries[0]  # display the first row to inspect its structure
<tr class="athing" id="20135938">
      <td align="right" class="title" valign="top"><span class="rank">1.</span></td>      <td class="votelinks" valign="top"><center><a href="vote?id=20135938&how=up&goto=news" id="up_20135938"><div class="votearrow" title="upvote"></div></a></center></td><td class="title"><a class="storylink" href="https://www.npr.org/2019/06/08/730898429/huawei-accused-of-technology-theft">Huawei Accused of Technology Theft</a><span class="sitebit comhead"> (<a href="from?site=npr.org"><span class="sitestr">npr.org</span></a>)</span></td></tr>
# Extract title
# The headline is the text of the row's <a class="storylink"> element
first_link = summaries[0].find("a", class_="storylink")
first_link.get_text().strip()
'Huawei Accused of Technology Theft'
# Find all articles, extract titles
articles = []
summaries = soup.find_all("tr", class_="athing")
for summary in summaries:
    link = summary.find("a", class_="storylink")
    if link is None:
        # find() returns None when a row has no story link; skip that row
        # instead of failing with AttributeError on None.get_text()
        continue
    articles.append(link.get_text().strip())

print(len(articles), "Article summaries found. Sample:")
if articles:  # nothing to sample (and no IndexError) when the scrape is empty
    print(articles[0])
30 Article summaries found. Sample:
Huawei Accused of Technology Theft

Normalization

Case Normalization

# Sample text (adjacent literals are concatenated into a single string)
text = (
    "The first time you see The Second Renaissance it may look boring. "
    "Look at it at least twice and definitely watch part 2. "
    "It will change your view of the matrix. "
    "Are the human people the ones who started the war ? "
    "Is AI a bad thing ?"
)
print(text)
The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?
# Lowercase the whole string so that e.g. "The" and "the" become the same token
text = text.lower()
print(text)
the first time you see the second renaissance it may look boring. look at it at least twice and definitely watch part 2. it will change your view of the matrix. are the human people the ones who started the war ? is ai a bad thing ?

Punctuation Removal

import re

# Replace every character that is not an ASCII letter or digit with a space.
# This removes punctuation, but also any accented or non-Latin characters.
non_alnum = re.compile(r"[^a-zA-Z0-9]")
text = non_alnum.sub(" ", text)
print(text)
the first time you see the second renaissance it may look boring  look at it at least twice and definitely watch part 2  it will change your view of the matrix  are the human people the ones who started the war   is ai a bad thing  

Tokenization

# Split text into tokens (words)
# split() with no argument breaks on any run of whitespace, so the repeated
# spaces left where punctuation was removed do not produce empty tokens.
words = text.split()
print(words)
['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']

NLTK: Natural Language ToolKit

import os
import nltk
# Add ./nltk_data (relative to the current working directory) to NLTK's data search path
nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
nltk.data.path.append(nltk_data_dir)
# Second sample: contains an abbreviation ("Dr.") and a comma
text = (
    "Dr. Smith graduated from the University of Washington. "
    "He later started an analytics firm called Lux, "
    "which catered to enterprise customers."
)
print(text)
Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers.
from nltk.tokenize import word_tokenize

# Split text into words using NLTK
# As the output shows, sentence-ending periods and commas come back as their
# own tokens, while the abbreviation "Dr." is kept as a single token.
words = word_tokenize(text)
print(words)
['Dr.', 'Smith', 'graduated', 'from', 'the', 'University', 'of', 'Washington', '.', 'He', 'later', 'started', 'an', 'analytics', 'firm', 'called', 'Lux', ',', 'which', 'catered', 'to', 'enterprise', 'customers', '.']
from nltk.tokenize import sent_tokenize

# Split text into sentences
# As the output shows, the period after "Dr" is not treated as a sentence
# boundary; the text is split into two sentences.
sentences = sent_tokenize(text)
print(sentences)
['Dr. Smith graduated from the University of Washington.', 'He later started an analytics firm called Lux, which catered to enterprise customers.']
# List stop words
from nltk.corpus import stopwords
# Print NLTK's English stop-word list. Every entry is lowercase, so text
# should be lowercased before it is filtered against this list.
print(stopwords.words("english"))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
# Start again from the original sample sentence
text = (
    "The first time you see The Second Renaissance it may look boring. "
    "Look at it at least twice and definitely watch part 2. "
    "It will change your view of the matrix. "
    "Are the human people the ones who started the war ? "
    "Is AI a bad thing ?"
)

# Normalize: lowercase first, then blank out every non-alphanumeric character
lowered = text.lower()
text = re.sub(r"[^a-zA-Z0-9]", " ", lowered)

# Tokenize on whitespace
words = text.split()
print(words)
['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']
# Remove stop words
# Build the stop-word collection once, as a set. Calling
# stopwords.words("english") inside the comprehension would re-evaluate it for
# every token and test membership with a linear scan of the returned list.
stop_words = set(stopwords.words("english"))
words = [w for w in words if w not in stop_words]
print(words)
['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'ones', 'started', 'war', 'ai', 'bad', 'thing']

Sentence Parsing

import nltk

# Define a custom grammar
# Context-free grammar for a tiny vocabulary. A prepositional phrase (PP) can
# attach either to a verb phrase (VP -> VP PP) or to a noun phrase
# (NP -> Det N PP), so the sentence parsed below is ambiguous under it.
my_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
parser = nltk.ChartParser(my_grammar)

# Parse a sentence
# Each tree produced by the parser is printed; this sentence yields two trees,
# one per PP attachment.
sentence = word_tokenize("I shot an elephant in my pajamas")
for tree in parser.parse(sentence):
    print(tree)
(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))

Stemming & Lemmatization

Stemming

from nltk.stem.porter import PorterStemmer

# Reduce words to their stems
# Create the stemmer once and reuse it, rather than constructing a new
# PorterStemmer for every word in the list.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)
['first', 'time', 'see', 'second', 'renaiss', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definit', 'watch', 'part', '2', 'chang', 'view', 'matrix', 'human', 'peopl', 'one', 'start', 'war', 'ai', 'bad', 'thing']

Lemmatization

from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form
# Create the lemmatizer once and reuse it, rather than constructing a new
# WordNetLemmatizer for every word. No pos is passed here, so the
# lemmatizer's default part of speech applies.
lemmatizer = WordNetLemmatizer()
lemmed = [lemmatizer.lemmatize(w) for w in words]
print(lemmed)
['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'one', 'started', 'war', 'ai', 'bad', 'thing']
# Lemmatize verbs by specifying pos
# Second pass over the already-lemmatized words, this time treating each one
# as a verb (pos='v'). A single lemmatizer instance is reused for all words.
lemmatizer = WordNetLemmatizer()
lemmed = [lemmatizer.lemmatize(w, pos='v') for w in lemmed]
print(lemmed)
['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'one', 'start', 'war', 'ai', 'bad', 'thing']

为者常成,行者常至